From 87d3c26d5925f574008ab4a9a5ceb26352b980e9 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Oct 2021 21:18:02 -0600 Subject: [PATCH 1/2] Add keep option, and fix the code path --- cpp/include/cudf/stream_compaction.hpp | 7 +++-- cpp/src/stream_compaction/drop_duplicates.cu | 29 +++++++++++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 192be4fb6a9..7b275d202a7 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -208,9 +208,10 @@ std::unique_ptr apply_boolean_mask( * @brief Choices for drop_duplicates API for retainment of duplicate rows */ enum class duplicate_keep_option { - KEEP_FIRST = 0, ///< Keeps first duplicate row and unique rows - KEEP_LAST, ///< Keeps last duplicate row and unique rows - KEEP_NONE ///< Keeps only unique rows are kept + KEEP_FIRST = 0, ///< Keeps first duplicate element and unique elements + KEEP_LAST, ///< Keeps last duplicate element and unique elements + KEEP_ANY, ///< Keeps one duplicate element at any position and unique elements + KEEP_NONE ///< Keeps only unique elements }; /** diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 30ea32fba8e..26fdcff2ec9 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -137,13 +137,28 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, null_order null_precedence, rmm::cuda_stream_view stream) { - // sort only indices - auto sorted_indices = sorted_order( - keys, - std::vector{}, - std::vector{static_cast(keys.num_columns()), null_precedence}, - stream, - rmm::mr::get_current_device_resource()); + // Sort only the indices. + // Note that if `keep` is given as `KEEP_ANY` or `KEEP_NONE`, we just use unstable sort for better + // performance. Otherwise, stable sort must be used. + auto sorted_indices = + keep == duplicate_keep_option::KEEP_ANY || keep == duplicate_keep_option::KEEP_NONE + ? sorted_order( + keys, + std::vector{}, + std::vector{static_cast(keys.num_columns()), null_precedence}, + stream, + rmm::mr::get_current_device_resource()) + : stable_sorted_order( + keys, + std::vector{}, + std::vector{static_cast(keys.num_columns()), null_precedence}, + stream, + rmm::mr::get_current_device_resource()); + + // From now, if `keep` is given as `KEEP_ANY`, just set it to `KEEP_FIRST`. + // Note that this will not make the results be the same as if `keep` is set to `KEEP_FIRST` since + // we have used unstable sort for sorting the indices. + if (keep == duplicate_keep_option::KEEP_ANY) { keep = duplicate_keep_option::KEEP_FIRST; } // extract unique indices auto device_input_table = cudf::table_device_view::create(keys, stream); From 4b2de5a6416eabdf0e24911d61829c599011409d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 15 Oct 2021 10:22:15 -0600 Subject: [PATCH 2/2] Remove `KEEP_ANY` option and use stable sort all the time --- cpp/include/cudf/stream_compaction.hpp | 1 - cpp/src/stream_compaction/drop_duplicates.cu | 28 +++++--------------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 7b275d202a7..7551511d281 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -210,7 +210,6 @@ std::unique_ptr
apply_boolean_mask( enum class duplicate_keep_option { KEEP_FIRST = 0, ///< Keeps first duplicate element and unique elements KEEP_LAST, ///< Keeps last duplicate element and unique elements - KEEP_ANY, ///< Keeps one duplicate element at any position and unique elements KEEP_NONE ///< Keeps only unique elements }; diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 26fdcff2ec9..5ec043ff1f9 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -138,27 +138,13 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, rmm::cuda_stream_view stream) { // Sort only the indices. - // Note that if `keep` is given as `KEEP_ANY` or `KEEP_NONE`, we just use unstable sort for better - // performance. Otherwise, stable sort must be used. - auto sorted_indices = - keep == duplicate_keep_option::KEEP_ANY || keep == duplicate_keep_option::KEEP_NONE - ? sorted_order( - keys, - std::vector{}, - std::vector{static_cast(keys.num_columns()), null_precedence}, - stream, - rmm::mr::get_current_device_resource()) - : stable_sorted_order( - keys, - std::vector{}, - std::vector{static_cast(keys.num_columns()), null_precedence}, - stream, - rmm::mr::get_current_device_resource()); - - // From now, if `keep` is given as `KEEP_ANY`, just set it to `KEEP_FIRST`. - // Note that this will not make the results be the same as if `keep` is set to `KEEP_FIRST` since - // we have used unstable sort for sorting the indices. - if (keep == duplicate_keep_option::KEEP_ANY) { keep = duplicate_keep_option::KEEP_FIRST; } + // Note that stable sort must be used to maintain the order of duplicate elements. + auto sorted_indices = stable_sorted_order( + keys, + std::vector{}, + std::vector{static_cast(keys.num_columns()), null_precedence}, + stream, + rmm::mr::get_current_device_resource()); // extract unique indices auto device_input_table = cudf::table_device_view::create(keys, stream);