diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f8074711b88..005792d187f 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -53,7 +53,6 @@ requirements: - cython >=0.29,<0.30 - scikit-build >=0.13.1 - setuptools - - numba >=0.56.4,<0.57 - dlpack >=0.5,<0.6.0a0 - pyarrow =11 - libcudf ={{ version }} @@ -69,7 +68,6 @@ requirements: - numpy >=1.21,<1.24 # Temporarily upper bound numpy to avoid overflow deprecations - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} - libcudf {{ version }} - - fastavro >=0.22.0 - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index bdc72cc4535..95f0db895a8 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -153,8 +153,12 @@ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- ConfigureNVBench( - STREAM_COMPACTION_NVBENCH stream_compaction/distinct.cpp stream_compaction/distinct_count.cpp - stream_compaction/unique.cpp stream_compaction/unique_count.cpp + STREAM_COMPACTION_NVBENCH + stream_compaction/distinct.cpp + stream_compaction/distinct_count.cpp + stream_compaction/stable_distinct.cpp + stream_compaction/unique.cpp + stream_compaction/unique_count.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp new file mode 100644 index 00000000000..6a9542c83a6 --- /dev/null +++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +#include + +NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms"); + +template +void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list) +{ + cudf::size_type const num_rows = state.get_int64("NumRows"); + + data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + + auto source_column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto input_column = source_column->view(); + auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(input_table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +using data_type = nvbench::type_list; + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type)) + .set_name("stable_distinct") + .set_type_axes_names({"Type"}) + .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +template +void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list) +{ + auto const size = state.get_int64("ColumnSize"); + auto const dtype = cudf::type_to_id(); + double const null_probability = state.get_float64("null_probability"); + + auto builder = data_profile_builder().null_probability(null_probability); + if (dtype == cudf::type_id::LIST) { + builder.distribution(dtype, distribution_id::UNIFORM, 0, 4) + .distribution(cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4) + .list_depth(1); + } else { + // We're comparing stable_distinct() on a non-nested column to that on a list column with the + // same number of stable_distinct rows. The max list size is 4 and the number of distinct values + // in the list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + + // 5^4 = 781 We want this column to also have 781 distinct values. + builder.distribution(dtype, distribution_id::UNIFORM, 0, 781); + } + + auto const table = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::stable_distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); + }); +} + +NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("stable_distinct_list") + .set_type_axes_names({"Type"}) + .add_float64_axis("null_probability", {0.0, 0.1}) + .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index e0fc7b71cd9..5476000fc29 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -86,24 +86,9 @@ std::unique_ptr distinct(table_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Create a new table without duplicate rows. + * @copydoc cudf::stable_distinct * - * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. The row order is guaranteed to be preserved as in the input. - * - * If there are duplicate rows, which row to be copied depends on the specified value of the `keep` - * parameter. - * - * This API produces exactly the same set of output rows as `cudf::distinct`. - * - * @param input The input table - * @param keys Vector of indices indicating key columns in the `input` table - * @param keep Copy any, first, last, or none of the found duplicates - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN elements should be considered as equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned table - * @return A table containing the resulting distinct rows + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index e2a6b97256f..984e3037cd1 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -254,22 +254,19 @@ std::unique_ptr
unique( * @brief Create a new table without duplicate rows. * * Given an `input` table_view, each row is copied to the output table to create a set of distinct - * rows. If there are duplicate rows, which row to be copied depends on the specified value of - * the `keep` parameter. + * rows. If there are duplicate rows, which row is copied depends on the `keep` parameter. * * The order of rows in the output table is not specified. * * Performance hint: if the input is pre-sorted, `cudf::unique` can produce an equivalent result * (i.e., same set of output rows) but with less running time than `cudf::distinct`. * - * @param[in] input input table_view to copy only distinct rows - * @param[in] keys vector of indices representing key columns from `input` - * @param[in] keep keep any, first, last, or none of the found duplicates - * @param[in] nulls_equal flag to control if nulls are compared equal or not - * @param[in] nans_equal flag to control if floating-point NaN values are compared equal or not - * @param[in] mr Device memory resource used to allocate the returned table's device - * memory - * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table * @return Table with distinct rows in an unspecified order */ std::unique_ptr
distinct( @@ -280,6 +277,36 @@ std::unique_ptr
distinct( nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new table without duplicate rows, preserving input order. + * + * Given an `input` table_view, each row is copied to the output table to create a set of distinct + * rows. The input row order is preserved. If there are duplicate rows, which row is copied depends + * on the `keep` parameter. + * + * This API produces the same output rows as `cudf::distinct`, but with input order preserved. + * + * Note that when `keep` is `KEEP_ANY`, the choice of which duplicate row to keep is arbitrary, but + * the returned table will retain the input order. That is, if the key column contained `1, 2, 1` + * with another values column `3, 4, 5`, the result could contain values `3, 4` or `4, 5` but not + * `4, 3` or `5, 4`. + * + * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param mr Device memory resource used to allocate the returned table + * @return Table with distinct rows, preserving input order + */ +std::unique_ptr
stable_distinct( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Count the number of consecutive groups of equivalent rows in a column. * diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 40b70986eca..f8d8702d1ca 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -38,6 +38,8 @@ class file_sink : public data_sink { if (detail::cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath, "w"); + CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", + _kvikio_file.is_compat_mode_on() ? "on" : "off"); } else { _cufile_out = detail::make_cufile_output(filepath); } diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e2cea7a56ff..a457a803e3c 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -42,6 +42,8 @@ class file_source : public datasource { { if (detail::cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath); + CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", + _kvikio_file.is_compat_mode_on() ? "on" : "off"); } else { _cufile_in = detail::make_cufile_input(filepath); } diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index d45897930b0..45a2de9288b 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -19,12 +19,14 @@ #include #include #include +#include #include #include #include -namespace cudf::detail { +namespace cudf { +namespace detail { std::unique_ptr
stable_distinct(table_view const& input, std::vector const& keys, @@ -45,7 +47,13 @@ std::unique_ptr
stable_distinct(table_view const& input, stream, rmm::mr::get_current_device_resource()); - // Markers to denote which rows to be copied to the output. + // The only difference between this implementation and the unstable version + // is that the stable implementation must retain the input order. The + // distinct indices are not sorted, so we cannot simply copy the rows in the + // order of the distinct indices and retain the input order. Instead, we use + // a boolean mask to indicate which rows to copy to the output. This avoids + // the need to sort the distinct indices, which is slower. + auto const output_markers = [&] { auto markers = rmm::device_uvector(input.num_rows(), stream); thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); @@ -58,13 +66,22 @@ std::unique_ptr
stable_distinct(table_view const& input, return markers; }(); - return cudf::detail::copy_if( - input, - [output_markers = output_markers.begin()] __device__(auto const idx) { - return *(output_markers + idx); - }, - stream, - mr); + return cudf::detail::apply_boolean_mask( + input, cudf::device_span(output_markers), stream, mr); +} + +} // namespace detail + +std::unique_ptr
stable_distinct(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_distinct( + input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } -} // namespace cudf::detail +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7f2807fc30e..1262e065041 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -401,8 +401,9 @@ ConfigureTest( stream_compaction/apply_boolean_mask_tests.cpp stream_compaction/distinct_count_tests.cpp stream_compaction/distinct_tests.cpp - stream_compaction/drop_nulls_tests.cpp stream_compaction/drop_nans_tests.cpp + stream_compaction/drop_nulls_tests.cpp + stream_compaction/stable_distinct_tests.cpp stream_compaction/unique_count_tests.cpp stream_compaction/unique_tests.cpp ) diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp index 85955ce7fc9..586792b4b30 100644 --- a/cpp/tests/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_tests.cpp @@ -126,7 +126,7 @@ TEST_F(DistinctKeepAny, EmptyInputTable) { int32s_col col(std::initializer_list{}); cudf::table_view input{{col}}; - std::vector key_idx{1, 2}; + std::vector key_idx{0}; auto got = cudf::distinct(input, key_idx, KEEP_ANY); CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); @@ -1217,11 +1217,11 @@ TEST_F(DistinctKeepAny, StructsOfStructs) // 8 | { {2, 1}, 5} | auto s1 = [&] { - auto a = int32s_col{1, 1, XXX, XXX, 2, 1, 1, XXX, 2}; - auto b = int32s_col{1, 2, XXX, XXX, 2, 1, 1, XXX, 1}; + auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2}; + auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1}; auto s2 = structs_col{{a, b}, nulls_at({2, 3, 7})}; - auto c = int32s_col{5, 4, 6, 4, 3, 3, 5, 4, 5}; + auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5}; std::vector> s1_children; s1_children.emplace_back(s2.release()); s1_children.emplace_back(c.release()); @@ -1270,11 +1270,11 @@ TEST_F(DistinctKeepAny, SlicedStructsOfStructs) // 8 | { {2, 1}, 5} | auto s1 = [&] { - auto a = int32s_col{1, 1, 2, 2, 2, 1, 1, 1, 2}; - auto b = int32s_col{1, 2, 1, 2, 2, 1, 1, 1, 1}; + auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2}; + auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1}; auto s2 = structs_col{{a, b}, nulls_at({2, 3, 7})}; - auto c = int32s_col{5, 4, 6, 4, 3, 3, 5, 4, 5}; + auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5}; std::vector> s1_children; s1_children.emplace_back(s2.release()); s1_children.emplace_back(c.release()); diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp new file mode 100644 index 00000000000..e28b96fc8be --- /dev/null +++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp @@ -0,0 +1,1354 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +auto constexpr null{0}; // null at current level +auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; +auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; +auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; +auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; +auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; +auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; + +using int32s_col = cudf::test::fixed_width_column_wrapper; +using floats_col = cudf::test::fixed_width_column_wrapper; +using lists_col = cudf::test::lists_column_wrapper; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +using cudf::nan_policy; +using cudf::null_equality; +using cudf::null_policy; +using cudf::test::iterators::no_nulls; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +struct StableDistinctKeepAny : public cudf::test::BaseFixture {}; + +struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {}; + +TEST_F(StableDistinctKeepAny, StringKeyColumn) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col = int32s_col{{5, 5, null, null, 5, 8, 1}, nulls_at({2, 3})}; + auto const keys = + strings_col{{"all", "all", "new", "new", "" /*NULL*/, "the", "strings"}, null_at(4)}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + auto const exp_col = int32s_col{{5, null, 5, 8, 1}, null_at(1)}; + auto const exp_keys = strings_col{{"all", "new", "" /*NULL*/, "the", "strings"}, null_at(2)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, StringKeyColumn) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{{0, null, 2, 3, 4, 5, 6}, null_at(1)}; + auto const keys = + strings_col{{"all", "new", "new", "all", "" /*NULL*/, "the", "strings"}, null_at(4)}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{{0, null, 4, 5, 6}, null_at(1)}; + auto const exp_keys = strings_col{{"all", "new", "" /*NULL*/, "the", "strings"}, null_at(2)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{{2, 3, 4, 5, 6}, no_nulls()}; + auto const exp_keys = strings_col{{"new", "all", "" /*NULL*/, "the", "strings"}, null_at(2)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{{4, 5, 6}, no_nulls()}; + auto const exp_keys = strings_col{{"" /*NULL*/, "the", "strings"}, null_at(0)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, EmptyInputTable) +{ + int32s_col col(std::initializer_list{}); + cudf::table_view input{{col}}; + std::vector key_idx{0}; + + auto got = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(StableDistinctKeepAny, NoColumnInputTable) +{ + cudf::table_view input{std::vector()}; + std::vector key_idx{1, 2}; + + auto got = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(input, got->view()); +} + +TEST_F(StableDistinctKeepAny, EmptyKeys) +{ + int32s_col col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}}; + int32s_col empty_col{}; + cudf::table_view input{{col}}; + std::vector key_idx{}; + + auto got = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(cudf::table_view{{empty_col}}, got->view()); +} + +TEST_F(StableDistinctKeepAny, NoNullsTable) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col1 = int32s_col{6, 6, 6, 3, 5, 8, 5}; + auto const col2 = floats_col{6, 6, 6, 3, 4, 9, 4}; + auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9}; + auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + auto const exp_col1 = int32s_col{6, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col1 = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5}; + auto const col2 = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4}; + auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9}; + auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // NaNs are unequal. + { + auto const exp_col1 = int32s_col{6, 1, 1, 1, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 1, 1, 1, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9}; + auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // NaNs are equal. + { + auto const exp_col1 = int32s_col{6, 1, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 1, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9}; + auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, NoNullsTable) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col1 = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const col2 = floats_col{10, 11, 12, 13, 14, 15, 16}; + auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9}; + auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21}; + + auto const input = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const key_idx = std::vector{2, 3}; + + // KEEP_FIRST + { + auto const exp_col1 = int32s_col{0, 3, 4, 5, 6}; + auto const exp_col2 = floats_col{10, 13, 14, 15, 16}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col1 = int32s_col{2, 3, 4, 5, 6}; + auto const exp_col2 = floats_col{12, 13, 14, 15, 16}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col1 = int32s_col{3, 4, 5, 6}; + auto const exp_col2 = floats_col{13, 14, 15, 16}; + auto const exp_keys1 = int32s_col{20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedNoNullsTable) +{ + auto constexpr dont_care = int32_t{0}; + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col1 = int32s_col{dont_care, dont_care, 6, 6, 6, 3, 5, 8, 5, dont_care}; + auto const col2 = floats_col{dont_care, dont_care, 6, 6, 6, 3, 4, 9, 4, dont_care}; + auto const keys1 = int32s_col{dont_care, dont_care, 20, 20, 20, 20, 19, 21, 9, dont_care}; + auto const keys2 = int32s_col{dont_care, dont_care, 19, 19, 19, 20, 20, 9, 21, dont_care}; + + auto const input_original = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const input = cudf::slice(input_original, {2, 9})[0]; + auto const key_idx = std::vector{2, 3}; + + auto const exp_col1 = int32s_col{6, 3, 5, 8, 5}; + auto const exp_col2 = floats_col{6, 3, 4, 9, 4}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, SlicedNoNullsTable) +{ + auto constexpr dont_care = int32_t{0}; + + // Column(s) used to test needs to have different rows for the same keys. + // clang-format off + auto const col1 = int32s_col{0, 1, 2, // <- don't care + 3, 4, 5, 6, 7, 8, 9, dont_care}; + auto const col2 = floats_col{10, 11, 12, // <- don't care + 13, 14, 15, 16, 17, 18, 19, dont_care}; + auto const keys1 = int32s_col{20, 20, 20, // <- don't care + 20, 20, 20, 20, 19, 21, 9, dont_care}; + auto const keys2 = int32s_col{19, 19, 19, // <- don't care + 19, 19, 19, 20, 20, 9, 21, dont_care}; + // clang-format on + auto const input_original = cudf::table_view{{col1, col2, keys1, keys2}}; + auto const input = cudf::slice(input_original, {3, 10})[0]; + auto const key_idx = std::vector{2, 3}; + + // KEEP_FIRST + { + auto const exp_col1 = int32s_col{3, 6, 7, 8, 9}; + auto const exp_col2 = floats_col{13, 16, 17, 18, 19}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col1 = int32s_col{5, 6, 7, 8, 9}; + auto const exp_col2 = floats_col{15, 16, 17, 18, 19}; + auto const exp_keys1 = int32s_col{20, 20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{19, 20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col1 = int32s_col{6, 7, 8, 9}; + auto const exp_col2 = floats_col{16, 17, 18, 19}; + auto const exp_keys1 = int32s_col{20, 19, 21, 9}; + auto const exp_keys2 = int32s_col{20, 20, 9, 21}; + auto const expected = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, InputWithNulls) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col = int32s_col{5, 4, 4, 1, 1, 8}; + auto const keys = int32s_col{{20, null, null, 19, 19, 21}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const exp_col = int32s_col{5, 4, 1, 8}; + auto const exp_keys = int32s_col{{20, null, 19, 21}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 8}; + auto const exp_keys = int32s_col{{20, null, null, 19, 21}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs) +{ + auto constexpr null{0.0}; // shadow the global `null` variable of type int + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const col = int32s_col{5, 4, 4, 1, 1, 1, 8, 8, 1}; + auto const keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 19., 21.}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal, NaNs are unequal. + { + auto const exp_col = int32s_col{5, 4, 1, 1, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, NaN, NaN, NaN, 19., 21.}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are equal, NaNs are equal. + { + auto const exp_col = int32s_col{5, 4, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, NaN, 19., 21.}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal, NaNs are unequal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 1, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 21.}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal, NaNs are equal. + { + auto const exp_col = int32s_col{5, 4, 4, 1, 8, 1}; + auto const exp_keys = floats_col{{20., null, null, NaN, 19., 21.}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNullsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const keys = int32s_col{{20, null, null, 19, 21, 19, 22}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 3, 4, 6}; + auto const exp_keys = int32s_col{{20, null, 19, 21, 22}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{0, 2, 4, 5, 6}; + auto const exp_keys = int32s_col{{20, null, 21, 19, 22}, null_at(1)}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{0, 4, 6}; + auto const exp_keys = int32s_col{{20, 21, 22}, no_nulls()}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNullsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; + auto const keys = int32s_col{{20, null, null, 19, 21, 19, 22, 20}, nulls_at({1, 2})}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; + auto const exp_keys = int32s_col{{20, null, null, 19, 21, 22}, nulls_at({1, 2})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; + auto const exp_keys = int32s_col{{null, null, 21, 19, 22, 20}, nulls_at({0, 1})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{1, 2, 4, 6}; + auto const exp_keys = int32s_col{{null, null, 21, 22}, nulls_at({0, 1})}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 3, 4, 6}; + auto const exp_keys = floats_col{20., NaN, 19., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{0, 2, 4, 5, 6}; + auto const exp_keys = floats_col{20., NaN, 21., 19., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{0, 4, 6}; + auto const exp_keys = floats_col{20., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const col = int32s_col{0, 1, 2, 3, 4, 5, 6, 7}; + auto const keys = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.}; + auto const input = cudf::table_view{{col, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_col = int32s_col{0, 1, 2, 3, 4, 6}; + auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = + cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_col = int32s_col{1, 2, 4, 5, 6, 7}; + auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_col = int32s_col{1, 2, 4, 6}; + auto const exp_keys = floats_col{NaN, NaN, 21., 22.}; + auto const expected = cudf::table_view{{exp_col, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, BasicLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + // clang-format off + auto const idx = int32s_col{ 0, 0, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6}; + auto const keys = lists_col{{}, {}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + // clang-format on + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{0, 1, 2, 3, 4, 5, 6}; + auto const exp_keys = lists_col{{}, {1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, BasicLists) +{ + // Column(s) used to test needs to have different rows for the same keys. + // clang-format off + auto const idx = int32s_col{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto const keys = lists_col{{}, {}, {1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + // clang-format on + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_idx = int32s_col{0, 2, 3, 5, 6, 7, 9}; + auto const exp_keys = lists_col{{}, {1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_idx = int32s_col{1, 3, 4, 5, 8, 9, 11}; + auto const exp_keys = lists_col{{}, {1, 1}, {1}, {1, 2}, {2}, {2, 1}, {2, 2}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_idx = int32s_col{3, 5, 9}; + auto const exp_keys = lists_col{{1, 1}, {1, 2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedBasicLists) +{ + auto constexpr dont_care = int32_t{0}; + + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const idx = int32s_col{dont_care, dont_care, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6, dont_care}; + auto const keys = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + auto const input_original = cudf::table_view{{idx, keys}}; + auto const input = cudf::slice(input_original, {2, 12})[0]; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_val = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto const expected = cudf::table_view{{exp_idx, exp_val}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepAny, NullableLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + auto const idx = int32s_col{0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4}; + auto const keys = + lists_col{{{}, {}, {1}, {1}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {} /*NULL*/, {} /*NULL*/}, + nulls_at({9, 10})}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const exp_idx = int32s_col{0, 1, 2, 3, 4}; + auto const exp_keys = lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/}, null_at(4)}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // Nulls are unequal. + { + auto const exp_idx = int32s_col{0, 1, 2, 3, 4, 4}; + auto const exp_keys = + lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/, {} /*NULL*/}, nulls_at({4, 5})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, ListsWithNullsEqual) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto const keys = + lists_col{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {} /*NULL*/, {2, 2}, {2, 2}, {} /*NULL*/}, + nulls_at({7, 10})}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_idx = int32s_col{0, 2, 4, 5, 7}; + auto const exp_keys = lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/}, null_at(4)}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_idx = int32s_col{1, 3, 6, 9, 10}; + auto const exp_keys = lists_col{{{}, {1}, {2}, {2, 2}, {} /*NULL*/}, null_at(4)}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_idx = int32s_col{}; + auto const exp_keys = lists_col{}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_EQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, ListsWithNullsUnequal) +{ + // Column(s) used to test needs to have different rows for the same keys. + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto const keys = + lists_col{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {} /*NULL*/, {2, 2}, {2, 2}, {} /*NULL*/}, + nulls_at({7, 10})}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const exp_idx = int32s_col{0, 2, 4, 5, 7, 10}; + auto const exp_keys = + lists_col{{{}, {1}, {2, 2}, {2}, {} /*NULL*/, {} /*NULL*/}, nulls_at({4, 5})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_LAST + { + auto const exp_idx = int32s_col{1, 3, 6, 7, 9, 10}; + auto const exp_keys = + lists_col{{{}, {1}, {2}, {} /*NULL*/, {2, 2}, {} /*NULL*/}, nulls_at({3, 5})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } + + // KEEP_NONE + { + auto const exp_idx = int32s_col{7, 10}; + auto const exp_keys = lists_col{{lists_col{} /*NULL*/, lists_col{} /*NULL*/}, nulls_at({0, 1})}; + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); + } +} + +TEST_F(StableDistinctKeepAny, ListsOfStructs) +{ + // Constructing a list of structs of two elements + // 0. [] == + // 1. [] != + // 2. Null == + // 3. Null != + // 4. [Null, Null] != + // 5. [Null] == + // 6. [Null] == + // 7. [Null] != + // 8. [{Null, Null}] != + // 9. [{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == + // 16. [{Null, 'b'}] + + auto const structs = [] { + auto child1 = + int32s_col{{XXX, XXX, XXX, XXX, XXX, null, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, null, null}, + nulls_at({5, 16, 17})}; + auto child2 = strings_col{{"" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*null*/, + "a", + "b", + "a", + "b", + "a", + "c", + "a", + "c", + "" /*null*/, + "" /*null*/, + "b", + "b"}, + nulls_at({5, 14, 15})}; + + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const null_it = nulls_at({2, 3}); + + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_it, null_it + 17); + + auto const keys = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2, 4, 5, 8, 9, 10, 11, 13, 15}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepFirstLastNone, ListsOfStructs) +{ + // Constructing a list of structs of two elements + // 0. [] == + // 1. [] != + // 2. Null == + // 3. Null != + // 4. [Null, Null] != + // 5. [Null] == + // 6. [Null] == + // 7. [Null] != + // 8. [{Null, Null}] != + // 9. [{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == + // 16. [{Null, 'b'}] + + auto const structs = [] { + auto child1 = + int32s_col{{XXX, XXX, XXX, XXX, XXX, null, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, null, null}, + nulls_at({5, 16, 17})}; + auto child2 = strings_col{{"" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*null*/, + "a", + "b", + "a", + "b", + "a", + "c", + "a", + "c", + "" /*null*/, + "" /*null*/, + "b", + "b"}, + nulls_at({5, 14, 15})}; + + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const null_it = nulls_at({2, 3}); + + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_it, null_it + 17); + + auto const keys = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const expect_map = int32s_col{0, 2, 4, 5, 8, 9, 10, 11, 13, 15}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_LAST + { + auto const expect_map = int32s_col{1, 3, 4, 7, 8, 9, 10, 12, 14, 16}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_NONE + { + auto const expect_map = int32s_col{4, 8, 9, 10}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedListsOfStructs) +{ + // Constructing a list of struct of two elements + // 0. [] == <- Don't care + // 1. [] != <- Don't care + // 2. Null == <- Don't care + // 3. Null != <- Don't care + // 4. [Null, Null] != <- Don't care + // 5. [Null] == <- Don't care + // 6. [Null] == <- Don't care + // 7. [Null] != <- Don't care + // 8. [{Null, Null}] != + // 9. [{1,'a'}, {2,'b'}] != + // 10. [{0,'a'}, {2,'b'}] != + // 11. [{0,'a'}, {2,'c'}] == + // 12. [{0,'a'}, {2,'c'}] != + // 13. [{0,Null}] == + // 14. [{0,Null}] != + // 15. [{Null, 'b'}] == <- Don't care + // 16. [{Null, 'b'}] <- Don't care + + auto const structs = [] { + auto child1 = + int32s_col{{XXX, XXX, XXX, XXX, XXX, null, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, null, null}, + nulls_at({5, 16, 17})}; + auto child2 = strings_col{{"" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*XXX*/, + "" /*null*/, + "a", + "b", + "a", + "b", + "a", + "c", + "a", + "c", + "" /*null*/, + "" /*null*/, + "b", + "b"}, + nulls_at({5, 14, 15})}; + + return structs_col{{child1, child2}, nulls_at({0, 1, 2, 3, 4})}; + }(); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18}; + auto const null_it = nulls_at({2, 3}); + + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_it, null_it + 17); + + auto const keys = cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 17, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10}; + auto const input_original = cudf::table_view{{idx, keys}}; + auto const input = cudf::slice(input_original, {8, 15})[0]; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{8, 9, 10, 11, 13}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{8, 9, 10, 11, 13, 14}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, ListsOfEmptyStructs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // 0. [] == + // 1. [] != + // 2. Null == + // 3. Null != + // 4. [Null, Null] == + // 5. [Null, Null] == + // 6. [Null, Null] != + // 7. [Null] == + // 8. [Null] != + // 9. [{}] == + // 10. [{}] != + // 11. [{}, {}] == + // 12. [{}, {}] + + auto const structs_null_it = nulls_at({0, 1, 2, 3, 4, 5, 6, 7}); + auto [structs_null_mask, structs_null_count] = + cudf::test::detail::make_null_mask(structs_null_it, structs_null_it + 14); + auto const structs = + cudf::column_view(cudf::data_type(cudf::type_id::STRUCT), + 14, + nullptr, + static_cast(structs_null_mask.data()), + structs_null_count); + + auto const offsets = int32s_col{0, 0, 0, 0, 0, 2, 4, 6, 7, 8, 9, 10, 12, 14}; + auto const lists_null_it = nulls_at({2, 3}); + auto [lists_null_mask, lists_null_count] = + cudf::test::detail::make_null_mask(lists_null_it, lists_null_it + 13); + auto const keys = + cudf::column_view(cudf::data_type(cudf::type_id::LIST), + 13, + nullptr, + static_cast(lists_null_mask.data()), + lists_null_count, + 0, + {offsets, structs}); + + auto const idx = int32s_col{1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2, 4, 7, 9, 11}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 5, 6, 7, 8, 9, 11}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, EmptyDeepList) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // List>, where all lists are empty: + // + // 0. [] + // 1. [] + // 2. Null + // 3. Null + + auto const keys = + lists_col{{lists_col{}, lists_col{}, lists_col{}, lists_col{}}, nulls_at({2, 3})}; + + auto const idx = int32s_col{1, 1, 2, 2}; + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, StructsOfStructs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // +-----------------+ + // | s1{s2{a,b}, c} | + // +-----------------+ + // 0 | { {1, 1}, 5} | + // 1 | { {1, 1}, 5} | // Same as 0 + // 2 | { {1, 2}, 4} | + // 3 | { Null, 6} | + // 4 | { Null, 4} | + // 5 | { Null, 4} | // Same as 4 + // 6 | Null | + // 7 | Null | // Same as 6 + // 8 | { {2, 1}, 5} | + + auto s1 = [&] { + auto a = int32s_col{1, 1, 1, XXX, XXX, XXX, XXX, XXX, 2}; + auto b = int32s_col{1, 1, 2, XXX, XXX, XXX, XXX, XXX, 1}; + auto s2 = structs_col{{a, b}, nulls_at({3, 4, 5})}; + + auto c = int32s_col{5, 5, 4, 6, 4, 4, XXX, XXX, 5}; + std::vector> s1_children; + s1_children.emplace_back(s2.release()); + s1_children.emplace_back(c.release()); + auto const null_it = nulls_at({6, 7}); + return structs_col(std::move(s1_children), std::vector{null_it, null_it + 9}); + }(); + + auto const idx = int32s_col{0, 0, 2, 3, 4, 4, 6, 6, 8}; + auto const input = cudf::table_view{{idx, s1}}; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 6, 8}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{0, 2, 3, 4, 4, 6, 6, 8}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedStructsOfStructs) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + // +-----------------+ + // | s1{s2{a,b}, c} | + // +-----------------+ + // 0 | { {1, 1}, 5} | + // 1 | { {1, 1}, 5} | // Same as 0 + // 2 | { {1, 2}, 4} | + // 3 | { Null, 6} | + // 4 | { Null, 4} | + // 5 | { Null, 4} | // Same as 4 + // 6 | Null | + // 7 | Null | // Same as 6 + // 8 | { {2, 1}, 5} | + + auto s1 = [&] { + auto a = int32s_col{1, 1, XXX, XXX, XXX, XXX, 1, XXX, 2}; + auto b = int32s_col{1, 2, XXX, XXX, XXX, XXX, 1, XXX, 1}; + auto s2 = structs_col{{a, b}, nulls_at({3, 4, 5})}; + + auto c = int32s_col{5, 4, 6, 4, XXX, XXX, 5, 4, 5}; + std::vector> s1_children; + s1_children.emplace_back(s2.release()); + s1_children.emplace_back(c.release()); + auto const null_it = nulls_at({6, 7}); + return structs_col(std::move(s1_children), std::vector{null_it, null_it + 9}); + }(); + + auto const idx = int32s_col{0, 0, 2, 3, 4, 4, 6, 6, 8}; + auto const input_original = cudf::table_view{{idx, s1}}; + auto const input = cudf::slice(input_original, {1, 7})[0]; + auto const key_idx = std::vector{1}; + + // Nulls are equal. + { + auto const expect_map = int32s_col{1, 2, 3, 4, 6}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // Nulls are unequal. + { + auto const expect_map = int32s_col{1, 2, 3, 4, 4, 6}; + auto const expect_table = cudf::gather(input_original, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY, NULL_UNEQUAL); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, StructsOfLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + auto const idx = int32s_col{1, 1, 2, 3, 4, 4, 4, 5, 5, 6}; + auto const keys = [] { + // All child columns are identical. + auto child1 = lists_col{{1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + auto child2 = lists_col{{1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + auto child3 = lists_col{{1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_keys = [] { + auto child1 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child2 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child3 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + return structs_col{{child1, child2, child3}}; + }(); + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} + +TEST_F(StableDistinctKeepFirstLastNone, StructsOfLists) +{ + auto const idx = int32s_col{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto const keys = [] { + // All child columns are identical. + auto child1 = lists_col{{1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + auto child2 = lists_col{{1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + auto child3 = lists_col{{1}, {1, 1}, {1}, {1, 2}, {2, 2}, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const input = cudf::table_view{{idx, keys}}; + auto const key_idx = std::vector{1}; + + // KEEP_FIRST + { + auto const expect_map = int32s_col{0, 1, 3, 4, 5, 7}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_FIRST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_LAST + { + auto const expect_map = int32s_col{1, 2, 3, 6, 7, 9}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_LAST); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } + + // KEEP_NONE + { + auto const expect_map = int32s_col{1, 3, 7}; + auto const expect_table = cudf::gather(input, expect_map); + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_NONE); + CUDF_TEST_EXPECT_TABLES_EQUAL(*expect_table, *result); + } +} + +TEST_F(StableDistinctKeepAny, SlicedStructsOfLists) +{ + // Column(s) used to test KEEP_ANY needs to have same rows in contiguous + // groups for equivalent keys because KEEP_ANY is nondeterministic. + + auto constexpr dont_care = int32_t{0}; + + auto const idx = int32s_col{dont_care, dont_care, 1, 1, 2, 3, 4, 4, 4, 5, 5, 6, dont_care}; + auto const keys = [] { + // All child columns are identical. + auto child1 = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + auto child2 = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + auto child3 = lists_col{ + {0, 0}, {0, 0}, {1}, {1}, {1, 1}, {1, 2}, {2, 2}, {2, 2}, {2, 2}, {2}, {2}, {2, 1}, {5, 5}}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const input_original = cudf::table_view{{idx, keys}}; + auto const input = cudf::slice(input_original, {2, 12})[0]; + auto const key_idx = std::vector{1}; + + auto const exp_idx = int32s_col{1, 2, 3, 4, 5, 6}; + auto const exp_keys = [] { + auto child1 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child2 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + auto child3 = lists_col{{1}, {1, 1}, {1, 2}, {2, 2}, {2}, {2, 1}}; + return structs_col{{child1, child2, child3}}; + }(); + auto const expected = cudf::table_view{{exp_idx, exp_keys}}; + + auto const result = cudf::stable_distinct(input, key_idx, KEEP_ANY); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result); +} diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 4fb20618737..e4da1c2d367 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -50,10 +50,11 @@ public interface EventHandler { * * @note the callback is invoked with this `ColumnVector`'s lock held. * + * @param cv - a reference to the ColumnVector we are closing * @param refCount - the updated ref count for this ColumnVector at the time * of invocation */ - void onClosed(int refCount); + void onClosed(ColumnVector cv, int refCount); } private static final Logger log = LoggerFactory.getLogger(ColumnVector.class); @@ -260,7 +261,7 @@ public synchronized void close() { refCount--; offHeap.delRef(); if (eventHandler != null) { - eventHandler.onClosed(refCount); + eventHandler.onClosed(this, refCount); } if (refCount == 0) { offHeap.clean(false); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 20fae236d5b..93003828f34 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6730,7 +6730,10 @@ void testColumnViewWithNonEmptyNullsIsCleared() { public void testEventHandlerIsCalledForEachClose() { final AtomicInteger onClosedWasCalled = new AtomicInteger(0); try (ColumnVector cv = ColumnVector.fromInts(1,2,3,4)) { - cv.setEventHandler(refCount -> onClosedWasCalled.incrementAndGet()); + cv.setEventHandler((col, refCount) -> { + assertEquals(cv, col); + onClosedWasCalled.incrementAndGet(); + }); } assertEquals(1, onClosedWasCalled.get()); } @@ -6744,7 +6747,9 @@ public void testEventHandlerIsNotCalledIfNotSet() { assertEquals(0, onClosedWasCalled.get()); try (ColumnVector cv = ColumnVector.fromInts(1,2,3,4)) { - cv.setEventHandler(refCount -> onClosedWasCalled.incrementAndGet()); + cv.setEventHandler((col, refCount) -> { + onClosedWasCalled.incrementAndGet(); + }); cv.setEventHandler(null); } assertEquals(0, onClosedWasCalled.get()); diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd index 61efd040807..bba2d1ffb7c 100644 --- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -20,6 +20,7 @@ from cudf._lib.cpp.types cimport ( cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ nogil: ctypedef enum duplicate_keep_option: + KEEP_ANY 'cudf::duplicate_keep_option::KEEP_ANY' KEEP_FIRST 'cudf::duplicate_keep_option::KEEP_FIRST' KEEP_LAST 'cudf::duplicate_keep_option::KEEP_LAST' KEEP_NONE 'cudf::duplicate_keep_option::KEEP_NONE' @@ -33,13 +34,14 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" \ column_view boolean_mask ) except + - cdef unique_ptr[table] unique( - table_view source_table, - vector[size_type] keys, - duplicate_keep_option keep, - null_equality nulls_equal) except + - cdef size_type distinct_count( column_view source_table, null_policy null_handling, nan_policy nan_handling) except + + + cdef unique_ptr[table] stable_distinct( + table_view input, + vector[size_type] keys, + duplicate_keep_option keep, + null_equality nulls_equal, + ) except + diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 143999e52ef..4422ad83885 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -9,22 +9,19 @@ from libcpp.vector cimport vector from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.sorting cimport stable_sort_by_key as cpp_stable_sort_by_key from cudf._lib.cpp.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, distinct_count as cpp_distinct_count, drop_nulls as cpp_drop_nulls, duplicate_keep_option, - unique as cpp_unique, + stable_distinct as cpp_stable_distinct, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport ( nan_policy, null_equality, - null_order, null_policy, - order, size_type, ) from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @@ -145,41 +142,13 @@ def drop_duplicates(list columns, if nulls_are_equal else null_equality.UNEQUAL ) - - cdef vector[order] column_order = ( - vector[order]( - cpp_keys.size(), - order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - cpp_keys.size(), - null_order.BEFORE - ) - ) - cdef table_view source_table_view = table_view_from_columns(columns) - cdef table_view keys_view = source_table_view.select(cpp_keys) - cdef unique_ptr[table] sorted_source_table cdef unique_ptr[table] c_result with nogil: - # cudf::unique keeps unique rows in each consecutive group of - # equivalent rows. To match the behavior of pandas.DataFrame. - # drop_duplicates, users need to stable sort the input first - # and then invoke cudf::unique. - sorted_source_table = move( - cpp_stable_sort_by_key( - source_table_view, - keys_view, - column_order, - null_precedence - ) - ) c_result = move( - cpp_unique( - sorted_source_table.get().view(), + cpp_stable_distinct( + source_table_view, cpp_keys, cpp_keep_option, cpp_nulls_equal diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index a2e3bc44f3a..1fe30179001 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1476,7 +1476,12 @@ def __array_function__(self, func, types, args, kwargs): if cudf_func is func: return NotImplemented else: - return cudf_func(*args, **kwargs) + result = cudf_func(*args, **kwargs) + if fname == "unique": + # NumPy expects a sorted result for `unique`, which is not + # guaranteed by cudf.Index.unique. + result = result.sort_values() + return result else: return NotImplemented diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 39332807139..d28851f4ace 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1042,8 +1042,15 @@ def data_array_view( ) -> cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) - def unique(self, preserve_order=False) -> CategoricalColumn: - codes = self.as_numerical.unique(preserve_order=preserve_order) + def unique(self, preserve_order=True) -> CategoricalColumn: + if preserve_order is not True: + warnings.warn( + "The preserve_order argument is deprecated. It will be " + "removed in a future version. As of now, unique always " + "preserves order regardless of the argument's value.", + FutureWarning, + ) + codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, codes=column.build_column(codes.base_data, dtype=codes.dtype), @@ -1397,9 +1404,7 @@ def _concat( head = next((obj for obj in objs if obj.valid_count), objs[0]) # Combine and de-dupe the categories - cats = column.concat_columns([o.categories for o in objs]).unique( - preserve_order=True - ) + cats = column.concat_columns([o.categories for o in objs]).unique() objs = [o._set_categories(cats, is_unique=True) for o in objs] codes = [o.codes for o in objs] @@ -1538,10 +1543,7 @@ def _set_categories( # Ensure new_categories is unique first if not (is_unique or new_cats.is_unique): - # drop_duplicates() instead of unique() to preserve order - new_cats = cudf.Series(new_cats)._column.unique( - preserve_order=True - ) + new_cats = cudf.Series(new_cats)._column.unique() cur_codes = self.codes max_cat_size = ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 607bf83ff6c..255ac2582af 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1021,17 +1021,16 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: ordered=dtype.ordered, ) - cats = self.unique().astype(self.dtype) + # Categories must be unique and sorted in ascending order. + cats = self.unique().sort_by_values()[0].astype(self.dtype) label_dtype = min_unsigned_type(len(cats)) labels = self._label_encoding( cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1) ) - # columns include null index in factorization; remove: if self.has_nulls(): cats = cats.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) - labels = labels - 1 if cudf.dtype(min_type).itemsize < labels.dtype.itemsize: labels = labels.astype(min_type) @@ -1132,25 +1131,17 @@ def searchsorted( values, side, ascending=ascending, na_position=na_position ) - def unique(self, preserve_order=False) -> ColumnBase: + def unique(self, preserve_order=True) -> ColumnBase: """ Get unique values in the data """ - # TODO: We could avoid performing `drop_duplicates` for - # columns with values that already are unique. - # Few things to note before we can do this optimization is - # the following issue resolved: - # https://github.com/rapidsai/cudf/issues/5286 - if preserve_order: - ind = as_column(cupy.arange(0, len(self))) - - # dedup based on the column of data only - ind, col = drop_duplicates([ind, self], keys=[1]) - - # sort col based on ind - map = ind.argsort() - return col.take(map) - + if preserve_order is not True: + warnings.warn( + "The preserve_order argument is deprecated. It will be " + "removed in a future version. As of now, unique always " + "preserves order regardless of the argument's value.", + FutureWarning, + ) return drop_duplicates([self], keep="first")[0] def serialize(self) -> Tuple[dict, list]: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index eb6685861d4..5fc4870105b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3165,34 +3165,46 @@ def diff(self, periods=1, axis=0): @_cudf_nvtx_annotate def drop_duplicates( - self, subset=None, keep="first", inplace=False, ignore_index=False + self, + subset=None, + keep="first", + inplace=False, + ignore_index=False, ): """ - Return DataFrame with duplicate rows removed, optionally only - considering certain subset of columns. + Return DataFrame with duplicate rows removed. + + Considering certain columns is optional. Indexes, including time + indexes are ignored. Parameters ---------- subset : column label or sequence of labels, optional Only consider certain columns for identifying duplicates, by default use all of the columns. - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last', ``False``}, default 'first' Determines which duplicates (if any) to keep. - - ``first`` : Drop duplicates except for the first occurrence. - - ``last`` : Drop duplicates except for the last occurrence. - - False : Drop all duplicates. - inplace : bool, default False + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + inplace : bool, default ``False`` Whether to drop duplicates in place or to return a copy. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. + ignore_index : bool, default ``False`` + If True, the resulting axis will be labeled 0, 1, ..., n - 1. Returns ------- DataFrame or None DataFrame with duplicates removed or None if ``inplace=True``. + See Also + -------- + DataFrame.value_counts: Count unique combinations of columns. + Examples -------- + Consider a dataset containing ramen ratings. + >>> import cudf >>> df = cudf.DataFrame({ ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], @@ -3207,36 +3219,34 @@ def drop_duplicates( 3 Indomie pack 15.0 4 Indomie pack 5.0 - By default, it removes duplicate rows based - on all columns. Note that order of - the rows being returned is not guaranteed - to be sorted. + By default, it removes duplicate rows based on all columns. >>> df.drop_duplicates() brand style rating + 0 Yum Yum cup 4.0 2 Indomie cup 3.5 - 4 Indomie pack 5.0 3 Indomie pack 15.0 - 0 Yum Yum cup 4.0 + 4 Indomie pack 5.0 - To remove duplicates on specific column(s), - use `subset`. + To remove duplicates on specific column(s), use ``subset``. >>> df.drop_duplicates(subset=['brand']) brand style rating - 2 Indomie cup 3.5 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 - To remove duplicates and keep last occurrences, use `keep`. + To remove duplicates and keep last occurrences, use ``keep``. >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') brand style rating + 1 Yum Yum cup 4.0 2 Indomie cup 3.5 4 Indomie pack 5.0 - 1 Yum Yum cup 4.0 """ # noqa: E501 outdf = super().drop_duplicates( - subset=subset, keep=keep, ignore_index=ignore_index + subset=subset, + keep=keep, + ignore_index=ignore_index, ) return self._mimic_inplace(outdf, inplace=inplace) @@ -7693,7 +7703,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Combine and de-dupe the categories categories[idx] = cudf.Series( concat_columns([col.categories for col in cols]) - )._column.unique(preserve_order=True) + )._column.unique() # Set the column dtype to the codes' dtype. The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 245c93ff17e..b7faed1dfc3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -308,7 +308,7 @@ def dtypes(self): 2 object int64 3 object int64 """ - index = self.grouping.keys.unique().to_pandas() + index = self.grouping.keys.unique().sort_values().to_pandas() return pd.DataFrame( { name: [self.obj._dtypes[name]] * len(index) @@ -678,7 +678,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # subsample the gather map from the full input ordering, # rather than permuting the gather map of the output. _, (ordering,), _ = self._groupby.groups( - [arange(0, self.obj._data.nrows)] + [arange(0, len(self.obj))] ) # Invert permutation from original order to groups on the # subset of entries we want. @@ -864,25 +864,27 @@ def ngroup(self, ascending=True): 5 0 dtype: int64 """ - num_groups = len(index := self.grouping.keys.unique()) + index = self.grouping.keys.unique().sort_values() + num_groups = len(index) _, has_null_group = bitmask_or([*index._columns]) if ascending: - if has_null_group: - group_ids = cudf.Series._from_data( - {None: cp.arange(-1, num_groups - 1)} - ) - else: - group_ids = cudf.Series._from_data( - {None: cp.arange(num_groups)} - ) + # Count ascending from 0 to num_groups - 1 + group_ids = cudf.Series._from_data({None: cp.arange(num_groups)}) + elif has_null_group: + # Count descending from num_groups - 1 to 0, but subtract one more + # for the null group making it num_groups - 2 to -1. + group_ids = cudf.Series._from_data( + {None: cp.arange(num_groups - 2, -2, -1)} + ) else: + # Count descending from num_groups - 1 to 0 group_ids = cudf.Series._from_data( {None: cp.arange(num_groups - 1, -1, -1)} ) if has_null_group: - group_ids.iloc[0] = cudf.NA + group_ids.iloc[-1] = cudf.NA group_ids._index = index return self._broadcast(group_ids) @@ -1065,7 +1067,7 @@ def _grouped(self): column_names=self.obj._column_names, index_names=self.obj._index_names, ) - group_names = grouped_keys.unique() + group_names = grouped_keys.unique().sort_values() return (group_names, offsets, grouped_keys, grouped_values) def _normalize_aggs( @@ -2270,11 +2272,29 @@ def _mimic_pandas_order( """ # TODO: copy metadata after this method is a common pattern, should # merge in this method. - _, order_cols, _ = self._groupby.groups( - [arange(0, result._data.nrows)] - ) - gather_map = order_cols[0].argsort() - result = result.take(gather_map) + + # This function is used to reorder the results of scan-based + # groupbys which have the same output size as input size. + # However, if the grouping key has NAs and dropna=True, the + # result coming back from libcudf has null_count few rows than + # the input, so we must produce an ordering from the full + # input range. + _, (ordering,), _ = self._groupby.groups([arange(0, len(self.obj))]) + if self._dropna and any( + c.has_nulls(include_nan=True) > 0 + for c in self.grouping._key_columns + ): + # Scan aggregations with null/nan keys put nulls in the + # corresponding output rows in pandas, to do that here + # expand the result by reindexing. + ri = cudf.RangeIndex(0, len(self.obj)) + result.index = cudf.Index(ordering) + # This reorders and expands + result = result.reindex(ri) + else: + # Just reorder according to the groupings + result = result.take(ordering.argsort()) + # Now produce the actual index we first thought of result.index = self.obj.index return result diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 519b84faea0..8bdf0938dfb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3028,6 +3028,10 @@ def __init__( if copy: data = column.as_column(data, dtype=dtype).copy() kwargs = _setdefault_name(data, name=name) + + if closed is None: + closed = "right" + if isinstance(data, IntervalColumn): data = data elif isinstance(data, pd.Series) and (is_interval_dtype(data.dtype)): diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 4b784ac7b20..2055ecc96a0 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1138,7 +1138,7 @@ def _get_unique(column, dummy_na): if isinstance(column, cudf.core.column.CategoricalColumn): unique = column.categories else: - unique = column.unique() + unique = column.unique().sort_by_values()[0] if not dummy_na: if np.issubdtype(unique.dtype, np.floating): unique = unique.nans_to_nulls() diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0e8481dd820..a99eda6bd0b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1151,7 +1151,12 @@ def __array_function__(self, func, types, args, kwargs): try: # Apply a Series method if one exists. if cudf_func := getattr(Series, func.__name__, None): - return cudf_func(*args, **kwargs) + result = cudf_func(*args, **kwargs) + if func.__name__ == "unique": + # NumPy expects a sorted result for `unique`, which is not + # guaranteed by cudf.Series.unique. + result = result.sort_values() + return result # Assume that cupy subpackages match numpy and search the # corresponding cupy submodule based on the func's __module__. @@ -1718,20 +1723,20 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): to be sorted. >>> s.drop_duplicates() - 3 beetle + 0 lama 1 cow + 3 beetle 5 hippo - 0 lama Name: animal, dtype: object The value 'last' for parameter `keep` keeps the last occurrence for each set of duplicated entries. >>> s.drop_duplicates(keep='last') - 3 beetle 1 cow - 5 hippo + 3 beetle 4 lama + 5 hippo Name: animal, dtype: object The value `False` for parameter `keep` discards all sets @@ -1740,8 +1745,8 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): >>> s.drop_duplicates(keep=False, inplace=True) >>> s - 3 beetle 1 cow + 3 beetle 5 hippo Name: animal, dtype: object """ @@ -2887,9 +2892,9 @@ def unique(self): 6 c dtype: object >>> series.unique() - 0 - 1 a - 2 b + 0 a + 1 b + 2 3 c dtype: object """ diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py index 06777c8e6af..f80f6d8bb72 100644 --- a/python/cudf/cudf/tests/indexes/test_interval.py +++ b/python/cudf/cudf/tests/indexes/test_interval.py @@ -1 +1,18 @@ # Copyright (c) 2023, NVIDIA CORPORATION. +import pandas as pd +import pyarrow as pa + +import cudf +from cudf.testing._utils import assert_eq + + +def test_interval_constructor_default_closed(): + idx = cudf.IntervalIndex([pd.Interval(0, 1)]) + assert idx.closed == "right" + assert idx.dtype.closed == "right" + + +def test_interval_to_arrow(): + expect = pa.Array.from_pandas(pd.IntervalIndex([pd.Interval(0, 1)])) + got = cudf.IntervalIndex([pd.Interval(0, 1)]).to_arrow() + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 65874c94b93..a355ebb40b2 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. import numpy as np import pandas as pd import pytest @@ -94,15 +94,26 @@ def test_array_func_missing_cudf_dataframe(pd_df, func): func(cudf_df) -# we only implement sum among all numpy non-ufuncs @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) @pytest.mark.parametrize("np_ar", [np.random.random(100)]) -@pytest.mark.parametrize("func", [lambda x: np.sum(x), lambda x: np.dot(x, x)]) +@pytest.mark.parametrize( + "func", + [ + lambda x: np.mean(x), + lambda x: np.sum(x), + lambda x: np.var(x, ddof=1), + lambda x: np.unique(x), + lambda x: np.dot(x, x), + ], +) def test_array_func_cudf_index(np_ar, func): cudf_index = cudf.core.index.as_index(cudf.Series(np_ar)) expect = func(np_ar) got = func(cudf_index) - assert_eq(expect, got) + if np.isscalar(expect): + assert_eq(expect, got) + else: + assert_eq(expect, got.to_numpy()) @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 7651382e2ac..e2af85b9e73 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2131,6 +2131,35 @@ def test_groupby_rank_fails(): gdf.groupby(["a"]).rank(method="min", axis=1) +@pytest.mark.parametrize( + "with_nan", [False, True], ids=["just-NA", "also-NaN"] +) +@pytest.mark.parametrize("dropna", [False, True], ids=["keepna", "dropna"]) +@pytest.mark.parametrize( + "duplicate_index", [False, True], ids=["rangeindex", "dupindex"] +) +def test_groupby_scan_null_keys(with_nan, dropna, duplicate_index): + key_col = [None, 1, 2, None, 3, None, 3, 1, None, 1] + if with_nan: + df = pd.DataFrame( + {"key": pd.Series(key_col, dtype="float32"), "value": range(10)} + ) + else: + df = pd.DataFrame( + {"key": pd.Series(key_col, dtype="Int32"), "value": range(10)} + ) + + if duplicate_index: + # Non-default index with duplicates + df.index = [1, 2, 3, 1, 3, 2, 4, 1, 6, 10] + + cdf = cudf.from_pandas(df) + + expect = df.groupby("key", dropna=dropna).cumsum() + got = cdf.groupby("key", dropna=dropna).cumsum() + assert_eq(expect, got) + + def test_groupby_mix_agg_scan(): err_msg = "Cannot perform both aggregation and scan in one operation" func = ["cumsum", "sum"] @@ -3231,3 +3260,19 @@ def test_head_tail(self, df, n, take_head, expected, preserve_order): else: actual = df.groupby("a").tail(n=n, preserve_order=preserve_order) assert_eq(actual, expected) + + +def test_head_tail_empty(): + # GH #13397 + + values = [1, 2, 3] + pdf = pd.DataFrame({}, index=values) + df = cudf.DataFrame({}, index=values) + + expected = pdf.groupby(pd.Series(values)).head() + got = df.groupby(cudf.Series(values)).head() + assert_eq(expected, got) + + expected = pdf.groupby(pd.Series(values)).tail() + got = df.groupby(cudf.Series(values)).tail() + assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index e5ca2e028c3..6d5bfde7740 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -8,7 +8,7 @@ import cudf from cudf import DataFrame -from cudf.testing import _utils as utils +from cudf.testing._utils import assert_eq pytestmark = pytest.mark.spilling @@ -31,14 +31,14 @@ def test_get_dummies(data, index): with pytest.warns(FutureWarning): encoded_actual = cudf.get_dummies(gdf, prefix="test") - utils.assert_eq( + assert_eq( encoded_expected, encoded_actual, check_dtype=len(data) != 0, ) encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=np.uint8) - utils.assert_eq( + assert_eq( encoded_expected, encoded_actual, check_dtype=len(data) != 0, @@ -59,7 +59,7 @@ def test_onehot_get_dummies_multicol(n_cols): with pytest.warns(FutureWarning): encoded_actual = cudf.get_dummies(gdf, prefix="test") - utils.assert_eq(encoded_expected, encoded_actual) + assert_eq(encoded_expected, encoded_actual) @pytest.mark.parametrize("nan_as_null", [True, False]) @@ -75,7 +75,7 @@ def test_onehost_get_dummies_dummy_na(nan_as_null, dummy_na): if dummy_na and nan_as_null: got = got.rename(columns={"a_null": "a_nan"})[expected.columns] - utils.assert_eq(expected, got) + assert_eq(expected, got) @pytest.mark.parametrize( @@ -115,7 +115,7 @@ def test_get_dummies_prefix_sep(prefix, prefix_sep): gdf, prefix=prefix, prefix_sep=prefix_sep ) - utils.assert_eq(encoded_expected, encoded_actual) + assert_eq(encoded_expected, encoded_actual) def test_get_dummies_with_nan(): @@ -124,55 +124,55 @@ def test_get_dummies_with_nan(): ) expected = cudf.DataFrame( { - "a_null": [0, 0, 0, 1], "a_1.0": [1, 0, 0, 0], "a_2.0": [0, 1, 0, 0], "a_nan": [0, 0, 1, 0], + "a_null": [0, 0, 0, 1], }, dtype="uint8", ) with pytest.warns(FutureWarning): actual = cudf.get_dummies(df, dummy_na=True, columns=["a"]) - utils.assert_eq(expected, actual) + assert_eq(expected, actual) @pytest.mark.parametrize( "data", [ - cudf.Series(["abc", "l", "a", "abc", "z", "xyz"]), - cudf.Index([None, 1, 2, 3.3, None, 0.2]), - cudf.Series([0.1, 2, 3, None, np.nan]), - cudf.Series([23678, 324, 1, 324], name="abc"), + lambda: cudf.Series(["abc", "l", "a", "abc", "z", "xyz"]), + lambda: cudf.Index([None, 1, 2, 3.3, None, 0.2]), + lambda: cudf.Series([0.1, 2, 3, None, np.nan]), + lambda: cudf.Series([23678, 324, 1, 324], name="abc"), ], ) @pytest.mark.parametrize("prefix_sep", ["-", "#"]) @pytest.mark.parametrize("prefix", [None, "hi"]) @pytest.mark.parametrize("dtype", ["uint8", "int16"]) def test_get_dummies_array_like(data, prefix_sep, prefix, dtype): - actual = cudf.get_dummies( - data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype - ) - if isinstance(data, (cudf.Series, cudf.BaseIndex)): - pd_data = data.to_pandas() - else: - pd_data = data + data = data() + pd_data = data.to_pandas() expected = pd.get_dummies( pd_data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype ) - utils.assert_eq(expected, actual) + + actual = cudf.get_dummies( + data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype + ) + + assert_eq(expected, actual) def test_get_dummies_array_like_with_nan(): ser = cudf.Series([0.1, 2, 3, None, np.nan], nan_as_null=False) expected = cudf.DataFrame( { - "a_null": [0, 0, 0, 1, 0], "a_0.1": [1, 0, 0, 0, 0], "a_2.0": [0, 1, 0, 0, 0], "a_3.0": [0, 0, 1, 0, 0], "a_nan": [0, 0, 0, 0, 1], + "a_null": [0, 0, 0, 1, 0], }, dtype="uint8", ) @@ -181,4 +181,4 @@ def test_get_dummies_array_like_with_nan(): ser, dummy_na=True, prefix="a", prefix_sep="_" ) - utils.assert_eq(expected, actual) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 816eb6468b0..2bddd93ccb8 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1103,8 +1103,7 @@ def test_string_unique(item): gs = cudf.Series(item) # Pandas `unique` returns a numpy array pres = pd.Series(ps.unique()) - # cudf returns sorted unique with `None` placed before other strings - pres = pres.sort_values(na_position="first").reset_index(drop=True) + # cudf returns a cudf.Series gres = gs.unique() assert_eq(pres, gres) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index cfb951901d3..84a821aaf79 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,7 +1,5 @@ # Copyright (c) 2021-2023, NVIDIA CORPORATION. -import contextlib - import numpy as np import pandas as pd import pytest @@ -78,7 +76,18 @@ def test_groupby_basic(series, aggregation, pdf): # TODO: explore adding support with `.agg()` @pytest.mark.parametrize("series", [True, False]) -@pytest.mark.parametrize("aggregation", ["cumsum", "cumcount"]) +@pytest.mark.parametrize( + "aggregation", + [ + "cumsum", + pytest.param( + "cumcount", + marks=pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/13390" + ), + ), + ], +) def test_groupby_cumulative(aggregation, pdf, series): gdf = cudf.DataFrame.from_pandas(pdf) ddf = dask_cudf.from_cudf(gdf, npartitions=5) @@ -90,17 +99,10 @@ def test_groupby_cumulative(aggregation, pdf, series): gdf_grouped = gdf_grouped.xx ddf_grouped = ddf_grouped.xx - if pdf.isna().sum().any(): - # https://github.com/rapidsai/cudf/issues/12055 - gdf_grouped = gdf.groupby("xx") - context = pytest.raises(ValueError) - else: - context = contextlib.nullcontext() - with context: - a = getattr(gdf_grouped, aggregation)() - b = getattr(ddf_grouped, aggregation)() + a = getattr(gdf_grouped, aggregation)() + b = getattr(ddf_grouped, aggregation)() - dd.assert_eq(a, b) + dd.assert_eq(a, b) @pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS) @@ -136,7 +138,6 @@ def test_groupby_agg(func, aggregation, pdf): @pytest.mark.parametrize("split_out", [1, 3]) def test_groupby_agg_empty_partition(tmpdir, split_out): - # Write random and empty cudf DataFrames # to two distinct files. df = cudf.datasets.randomdata() @@ -496,7 +497,6 @@ def test_groupby_mean_sort_false(): def test_groupby_reset_index_dtype(): - # Make sure int8 dtype is properly preserved # Through various cudf/dask_cudf ops #