From 08ac1eb7832fe99f44b25f192d9931d393a96983 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 2 Apr 2024 08:27:49 -1000 Subject: [PATCH 001/842] Bump ruff and codespell pre-commit checks (#15407) xref https://github.com/rapidsai/cudf/pull/15345#discussion_r1532379047 Before pursuing migrating isort to ruff, bumping ruff to the latest version Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15407 --- .pre-commit-config.yaml | 4 ++-- cpp/include/cudf/io/detail/parquet.hpp | 4 ++-- cpp/src/copying/contiguous_split.cu | 2 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- pyproject.toml | 8 +++++--- python/cudf/benchmarks/common/config.py | 3 ++- python/cudf/cudf/_fuzz_testing/utils.py | 6 +++--- python/cudf/cudf/core/buffer/buffer.py | 2 +- python/cudf/cudf/core/buffer/spillable_buffer.py | 2 +- python/cudf/cudf/core/column/__init__.py | 1 - python/cudf/cudf/core/column/methods.py | 12 ++++-------- python/cudf/cudf/core/column/string.py | 6 ++---- python/cudf/cudf/io/parquet.py | 6 +++--- .../cudf/pandas/scripts/analyze-test-failures.py | 3 ++- .../cudf/pandas/scripts/summarize-test-results.py | 3 ++- python/cudf/cudf/tests/test_index.py | 1 + python/cudf/cudf/tests/test_monotonic.py | 1 + python/cudf/cudf/tests/test_multiindex.py | 1 + python/cudf/cudf/utils/docutils.py | 1 + python/cudf/cudf/utils/dtypes.py | 2 +- 20 files changed, 36 insertions(+), 34 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 06fdcb9f761..3e99cf3fa9a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -113,7 +113,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.6 hooks: - id: codespell additional_dependencies: [tomli] @@ -129,7 +129,7 @@ repos: - id: rapids-dependency-file-generator args: ["--clean"] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.3.4 hooks: - id: ruff files: python/.*$ diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 0b8ee9676de..df870f6f1e4 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,7 +110,7 @@ class chunked_reader : private reader { * The chunk_read_limit parameter controls the size of the output chunks produces. If the user * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have * a total bytes size (over all columns) of 100 MB or less. This is a soft limit and the code - * will not fail if it cannot satisfy the limit. It will make a best-effort atttempt only. + * will not fail if it cannot satisfy the limit. It will make a best-effort attempt only. * * The pass_read_limit parameter controls how much temporary memory is used in the process of * decoding the file. The primary contributor to this memory usage is the uncompressed size of diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 23224d3225d..23bcd344a32 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1139,7 +1139,7 @@ struct packed_src_and_dst_pointers { /** * @brief Create an instance of `packed_src_and_dst_pointers` populating destination - * partitition buffers (if any) from `out_buffers`. In the chunked_pack case + * partition buffers (if any) from `out_buffers`. In the chunked_pack case * `out_buffers` is empty, and the destination pointer is provided separately * to the `copy_partitions` kernel. * diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index f5f540bc3a4..d54524f0f0d 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -194,7 +194,7 @@ aggregate_orc_metadata::select_stripes( } else { int64_t count = 0; int64_t stripe_skip_rows = 0; - // Iterate all source files, each source file has corelating metadata + // Iterate all source files, each source file has correlating metadata for (size_t src_file_idx = 0; src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read; ++src_file_idx) { diff --git a/pyproject.toml b/pyproject.toml index 28eac66c1d6..797b5374cb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,11 +19,14 @@ exclude = [ skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp" # ignore short words, and typename parameters like OffsetT ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b" -ignore-words-list = "inout,unparseable,falsy" +ignore-words-list = "inout,unparseable,falsy,couldn,Couldn" builtin = "clear" quiet-level = 3 [tool.ruff] +line-length = 79 + +[tool.ruff.lint] select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] ignore = [ # whitespace before : @@ -36,9 +39,8 @@ exclude = [ # TODO: Remove this in a follow-up where we fix __all__. "__init__.py", ] -line-length = 79 -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] # Lots of pytest implicitly injected attributes in conftest-patch.py "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"] "python/cudf/cudf/pandas/scripts/*" = ["D"] diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py index 305a21d0a29..c1e9d4d6116 100644 --- a/python/cudf/benchmarks/common/config.py +++ b/python/cudf/benchmarks/common/config.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Module used for global configuration of benchmarks. @@ -20,6 +20,7 @@ in this file and import them in conftest.py to ensure that they are handled appropriately. """ + import os import sys diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 6e53195ac2d..d685174f3c2 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -99,9 +99,9 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): low=1, high=10 ) else: - meta[ - "max_types_at_each_level" - ] = obj._max_struct_types_at_each_level + meta["max_types_at_each_level"] = ( + obj._max_struct_types_at_each_level + ) elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 8d278c9c065..1631fa00412 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -181,7 +181,7 @@ def _from_host_memory(cls, data: Any) -> Self: Parameters ---------- data : Any - An object that represens host memory. + An object that represents host memory. Returns ------- diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index b25af13679c..a9569190e75 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -154,7 +154,7 @@ def _from_host_memory(cls, data: Any) -> Self: Parameters ---------- data : Any - An object that represens host memory. + An object that represents host memory. Returns ------- diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 2a46654ccc2..e7119fcdf47 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -4,7 +4,6 @@ isort: skip_file """ - from cudf.core.column.categorical import CategoricalColumn from cudf.core.column.column import ( ColumnBase, diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 0f5a0eb086b..e827c7a3dd3 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -26,8 +26,7 @@ def _return_or_inplace( inplace: Literal[True], expand: bool = False, retain_index: bool = True, - ) -> None: - ... + ) -> None: ... @overload def _return_or_inplace( @@ -36,8 +35,7 @@ def _return_or_inplace( inplace: Literal[False], expand: bool = False, retain_index: bool = True, - ) -> ParentType: - ... + ) -> ParentType: ... @overload def _return_or_inplace( @@ -45,8 +43,7 @@ def _return_or_inplace( new_col, expand: bool = False, retain_index: bool = True, - ) -> ParentType: - ... + ) -> ParentType: ... @overload def _return_or_inplace( @@ -55,8 +52,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[ParentType]: - ... + ) -> Optional[ParentType]: ... def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fb76fcdaf39..06d7aa030db 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -257,14 +257,12 @@ def byte_count(self) -> SeriesOrIndex: @overload def cat( self, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> str: - ... + ) -> str: ... @overload def cat( self, others, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: - ... + ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... def cat(self, others=None, sep=None, na_rep=None): """ diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index bead9c352ef..e55898de675 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1220,9 +1220,9 @@ def __init__( ) -> None: if isinstance(path, str) and path.startswith("s3://"): self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: Optional[ - tempfile.TemporaryDirectory - ] = tempfile.TemporaryDirectory() + self.dir_: Optional[tempfile.TemporaryDirectory] = ( + tempfile.TemporaryDirectory() + ) self.path = self.dir_.name else: self.fs_meta = {} diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index f1744c9e92b..8870fbc5c28 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -11,6 +11,7 @@ Example: python analyze-test-failures.py log.json frame/* """ + import json import sys from collections import Counter diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index bfc56319d82..ffd2abb960d 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -10,6 +10,7 @@ python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table """ + import argparse import json diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 05213d7601c..ebbca57bd40 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -3,6 +3,7 @@ """ Test related to Index """ + import datetime import operator import re diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 53919a95115..3c627a5fe89 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -4,6 +4,7 @@ Tests related to is_unique, is_monotonic_increasing & is_monotonic_decreasing attributes """ + import numpy as np import pandas as pd import pytest diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 4926d79e734..76a82afb78e 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -3,6 +3,7 @@ """ Test related to MultiIndex """ + import datetime import itertools import operator diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 68447f423a4..4136d97d69f 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -3,6 +3,7 @@ """ Helper functions for parameterized docstring """ + import functools import re import string diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e9dbc23d767..8521239413e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -587,7 +587,7 @@ def find_common_type(dtypes): def _dtype_pandas_compatible(dtype): """ A utility function, that returns `str` instead of `object` - dtype when pandas comptibility mode is enabled. + dtype when pandas compatibility mode is enabled. """ if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"): return "str" From 08d86c92b3e3ccd950e4d63033d44675510cbb74 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 2 Apr 2024 12:29:43 -0700 Subject: [PATCH 002/842] Fix errors in chunked ORC writer when no tables were (successfully) written (#15393) Closes https://github.com/rapidsai/cudf/issues/15386, https://github.com/rapidsai/cudf/issues/15387 The fixes for the two issues overlap, so I included both in a single PR. Expanded the `_closed` flag to an enum that tracks if the operations in `close()` should be performed (one or more tables were written to the sink). This way, we don't perform the steps in close when there is no valid file to write the footer for. This includes: - No `write` calls; - All `write` calls failed; The new enum replaces `skip_close()` that used to fix this issue for a smaller subset of cases. Additionally, writing of the ORC header has been moved after the encode and uses the new state to only write the header in the first `write` call. This way we don't write anything to the sink if there were no `write` calls with the writer, and if the encode failed in the `write`s. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/15393 --- cpp/include/cudf/io/detail/orc.hpp | 8 ----- cpp/src/io/functions.cpp | 11 +----- cpp/src/io/orc/writer_impl.cu | 29 +++++++-------- cpp/src/io/orc/writer_impl.hpp | 20 +++++------ cpp/tests/io/orc_test.cpp | 58 +++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 47 deletions(-) diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 3c1486b60c2..c63c952e148 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -124,14 +124,6 @@ class writer { * @brief Finishes the chunked/streamed write process. */ void close(); - - /** - * @brief Skip work done in `close()`; should be called if `write()` failed. - * - * Calling skip_close() prevents the writer from writing the (invalid) file footer and the - * postscript. - */ - void skip_close(); }; } // namespace orc::detail } // namespace cudf::io diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b8353d312fe..46c6c67c8df 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -436,16 +436,7 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) auto writer = std::make_unique( std::move(sinks[0]), options, io_detail::single_write_mode::YES, stream); - try { - writer->write(options.get_table()); - } catch (...) { - // If an exception is thrown, the output is incomplete/corrupted. - // Make sure the writer will not close with such corrupted data. - // In addition, the writer may throw an exception while trying to close, which would terminate - // the process. - writer->skip_close(); - throw; - } + writer->write(options.get_table()); } /** diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ade0e75de35..750a593920c 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2438,7 +2438,6 @@ writer::impl::impl(std::unique_ptr sink, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } writer::impl::impl(std::unique_ptr sink, @@ -2460,20 +2459,13 @@ writer::impl::impl(std::unique_ptr sink, if (options.get_metadata()) { _table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } writer::impl::~impl() { close(); } -void writer::impl::init_state() -{ - // Write file header - _out_sink->host_write(MAGIC, std::strlen(MAGIC)); -} - void writer::impl::write(table_view const& input) { - CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed"); + CUDF_EXPECTS(_state != writer_state::CLOSED, "Data has already been flushed to out and closed"); if (not _table_meta) { _table_meta = make_table_meta(input); } @@ -2516,6 +2508,11 @@ void writer::impl::write(table_view const& input) } }(); + if (_state == writer_state::NO_DATA_WRITTEN) { + // Write the ORC file header if this is the first write + _out_sink->host_write(MAGIC, std::strlen(MAGIC)); + } + // Compression/encoding were all successful. Now write the intermediate results. write_orc_data_to_sink(enc_data, segmentation, @@ -2533,6 +2530,8 @@ void writer::impl::write(table_view const& input) // Update file-level and compression statistics update_statistics(orc_table.num_rows(), std::move(intermediate_stats), compression_stats); + + _state = writer_state::DATA_WRITTEN; } void writer::impl::update_statistics( @@ -2683,8 +2682,11 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table, void writer::impl::close() { - if (_closed) { return; } - _closed = true; + if (_state != writer_state::DATA_WRITTEN) { + // writer is either closed or no data has been written + _state = writer_state::CLOSED; + return; + } PostScript ps; if (_stats_freq != statistics_freq::STATISTICS_NONE) { @@ -2769,6 +2771,8 @@ void writer::impl::close() pbw.put_byte(ps_length); _out_sink->host_write(pbw.data(), pbw.size()); _out_sink->flush(); + + _state = writer_state::CLOSED; } // Forward to implementation @@ -2795,9 +2799,6 @@ writer::~writer() = default; // Forward to implementation void writer::write(table_view const& table) { _impl->write(table); } -// Forward to implementation -void writer::skip_close() { _impl->skip_close(); } - // Forward to implementation void writer::close() { _impl->close(); } diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 417d29efb58..bd082befe0c 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -227,6 +227,14 @@ struct encoded_footer_statistics { std::vector file_level; }; +enum class writer_state { + NO_DATA_WRITTEN, // No table data has been written to the sink; if the writer is closed or + // destroyed in this state, it should not write the footer. + DATA_WRITTEN, // At least one table has been written to the sink; when the writer is closed, + // it should write the footer. + CLOSED // Writer has been closed; no further writes are allowed. +}; + /** * @brief Implementation for ORC writer */ @@ -266,11 +274,6 @@ class writer::impl { */ ~impl(); - /** - * @brief Begins the chunked/streamed write process. - */ - void init_state(); - /** * @brief Writes a single subtable as part of a larger ORC file/table write. * @@ -283,11 +286,6 @@ class writer::impl { */ void close(); - /** - * @brief Skip writing the footer when closing/deleting the writer. - */ - void skip_close() { _closed = true; } - private: /** * @brief Write the intermediate ORC data into the data sink. @@ -363,7 +361,7 @@ class writer::impl { Footer _footer; Metadata _orc_meta; persisted_statistics _persisted_stripe_statistics; // Statistics data saved between calls. - bool _closed = false; // To track if the output has been written to sink. + writer_state _state = writer_state::NO_DATA_WRITTEN; }; } // namespace cudf::io::orc::detail diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 24e2e2cfea0..e108e68e1f9 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -2100,8 +2101,7 @@ TEST_F(OrcWriterTest, BounceBufferBug) auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; }); constexpr auto num_rows = 150000; - column_wrapper col(sequence, - sequence + num_rows); + column_wrapper col(sequence, sequence + num_rows); table_view expected({col}); auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc"); @@ -2120,8 +2120,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) static_assert(total_rows > std::numeric_limits::max()); auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; }); - column_wrapper col(sequence, - sequence + num_rows); + column_wrapper col(sequence, sequence + num_rows); table_view chunk_table({col}); std::vector out_buffer; @@ -2169,4 +2168,55 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_with_stripe_selection->view()); } +TEST_F(OrcChunkedWriterTest, NoWriteCloseNotThrow) +{ + std::vector out_buffer; + + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + EXPECT_NO_THROW(writer.close()); +} + +TEST_F(OrcChunkedWriterTest, FailedWriteCloseNotThrow) +{ + // A sink that throws on write() + class throw_sink : public cudf::io::data_sink { + public: + void host_write(void const* data, size_t size) override { throw std::runtime_error("write"); } + void flush() override {} + size_t bytes_written() override { return 0; } + }; + + auto sequence = thrust::make_counting_iterator(0); + column_wrapper col(sequence, sequence + 10); + table_view table({col}); + + throw_sink sink; + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&sink}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + try { + writer.write(table); + } catch (...) { + // ignore the exception; we're testing that close() doesn't throw when the only write() fails + } + + EXPECT_NO_THROW(writer.close()); +} + +TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite) +{ + std::vector out_buffer; + + cudf::io::chunked_orc_writer_options write_opts = + cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer}); + auto writer = cudf::io::orc_chunked_writer(write_opts); + + EXPECT_NO_THROW(writer.close()); + EXPECT_EQ(out_buffer.size(), 0); +} + CUDF_TEST_PROGRAM_MAIN() From 13a5c7be33bec538a9f81872471c29796e67bce5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 2 Apr 2024 16:54:09 -0400 Subject: [PATCH 003/842] Rework cudf::replace_nulls to use strings::detail::copy_if_else (#15286) Removes the specialized kernels for strings in `cudf::replace_nulls` and replaces them with a call to `cudf::strings::detail::copy_if_else` which is already enabled with offsetalator support and optimized for long strings. This will also allow `cudf::replace_nulls` to use large strings with no further changes. Also includes a `replace_nulls` benchmark for strings. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15286 --- cpp/benchmarks/CMakeLists.txt | 3 +- cpp/benchmarks/replace/nulls.cpp | 59 ++++++++++++++ cpp/src/replace/nulls.cu | 127 +++++-------------------------- 3 files changed, 79 insertions(+), 110 deletions(-) create mode 100644 cpp/benchmarks/replace/nulls.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index c82e475dece..798e4e76141 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -208,8 +208,9 @@ ConfigureNVBench( ) # ################################################################################################## -# * reduction benchmark --------------------------------------------------------------------------- +# * replace benchmark --------------------------------------------------------------------------- ConfigureBench(REPLACE_BENCH replace/clamp.cpp replace/nans.cpp) +ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp) # ################################################################################################## # * filling benchmark ----------------------------------------------------------------------------- diff --git a/cpp/benchmarks/replace/nulls.cpp b/cpp/benchmarks/replace/nulls.cpp new file mode 100644 index 00000000000..ccd00050789 --- /dev/null +++ b/cpp/benchmarks/replace/nulls.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +static void replace_nulls(nvbench::state& state) +{ + auto const n_rows = static_cast(state.get_int64("num_rows")); + auto const max_width = static_cast(state.get_int64("row_width")); + + if (static_cast(n_rows) * static_cast(max_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + data_profile const table_profile = data_profile_builder().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width); + + auto const input_table = create_random_table( + {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile); + auto const input = input_table->view().column(0); + auto const repl = input_table->view().column(1); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = cudf::strings_column_view(input).chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); // all bytes are read; + state.add_global_memory_writes(chars_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto result = cudf::replace_nulls(input, repl); }); +} + +NVBENCH_BENCH(replace_nulls) + .set_name("replace_nulls") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048}) + .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216}); diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 014171f2b40..299cdc6a160 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -32,8 +32,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -56,63 +56,6 @@ namespace { // anonymous static constexpr int BLOCK_SIZE = 256; -template -CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input, - cudf::column_device_view replacement, - cudf::bitmask_type* output_valid, - cudf::size_type* offsets, - char* chars, - cudf::size_type* valid_counter) -{ - cudf::size_type nrows = input.size(); - auto i = cudf::detail::grid_1d::global_thread_id(); - auto const stride = cudf::detail::grid_1d::grid_stride(); - - uint32_t active_mask = 0xffff'ffff; - active_mask = __ballot_sync(active_mask, i < nrows); - auto const lane_id{threadIdx.x % cudf::detail::warp_size}; - uint32_t valid_sum{0}; - - while (i < nrows) { - bool input_is_valid = input.is_valid_nocheck(i); - bool output_is_valid = true; - - if (replacement_has_nulls && !input_is_valid) { - output_is_valid = replacement.is_valid_nocheck(i); - } - - cudf::string_view out; - if (input_is_valid) { - out = input.element(i); - } else if (output_is_valid) { - out = replacement.element(i); - } - - bool nonzero_output = (input_is_valid || output_is_valid); - - if (phase == 0) { - offsets[i] = nonzero_output ? out.size_bytes() : 0; - uint32_t bitmask = __ballot_sync(active_mask, output_is_valid); - if (0 == lane_id) { - output_valid[cudf::word_index(i)] = bitmask; - valid_sum += __popc(bitmask); - } - } else if (phase == 1) { - if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes()); - } - - i += stride; - active_mask = __ballot_sync(active_mask, i < nrows); - } - - // Compute total valid count for this block and add it to global count - uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce(valid_sum); - // one thread computes and adds to output_valid_count - if (threadIdx.x == 0) { - atomicAdd(valid_counter, static_cast(block_valid_count)); - } -} - template CUDF_KERNEL void replace_nulls(cudf::column_device_view input, cudf::column_device_view replacement, @@ -222,58 +165,24 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - rmm::device_scalar valid_counter(0, stream); - cudf::size_type* valid_count = valid_counter.data(); - - auto replace_first = replace_nulls_strings<0, false>; - auto replace_second = replace_nulls_strings<1, false>; - if (replacement.has_nulls()) { - replace_first = replace_nulls_strings<0, true>; - replace_second = replace_nulls_strings<1, true>; + auto d_input = cudf::column_device_view::create(input, stream); + auto d_replacement = cudf::column_device_view::create(replacement, stream); + + auto lhs_iter = + cudf::detail::make_optional_iterator(*d_input, cudf::nullate::YES{}); + auto rhs_iter = cudf::detail::make_optional_iterator( + *d_replacement, cudf::nullate::DYNAMIC{replacement.nullable()}); + + auto filter = cudf::detail::validity_accessor{*d_input}; + auto result = cudf::strings::detail::copy_if_else( + lhs_iter, lhs_iter + input.size(), rhs_iter, filter, stream, mr); + + // input is nullable so result should always be nullable here + if (!result->nullable()) { + result->set_null_mask( + cudf::detail::create_null_mask(input.size(), cudf::mask_state::ALL_VALID, stream, mr), 0); } - - // Create new offsets column to use in kernel - std::unique_ptr sizes = cudf::make_numeric_column( - cudf::data_type(cudf::type_id::INT32), input.size(), cudf::mask_state::UNALLOCATED, stream); - - auto sizes_view = sizes->mutable_view(); - auto device_in = cudf::column_device_view::create(input, stream); - auto device_replacement = cudf::column_device_view::create(replacement, stream); - - rmm::device_buffer valid_bits = - cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); - - // Call first pass kernel to get sizes in offsets - cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1}; - replace_first<<>>( - *device_in, - *device_replacement, - reinterpret_cast(valid_bits.data()), - sizes_view.begin(), - nullptr, - valid_count); - - auto [offsets, bytes] = cudf::detail::make_offsets_child_column( - sizes_view.begin(), sizes_view.end(), stream, mr); - - auto offsets_view = offsets->mutable_view(); - - // Allocate chars array and output null mask - rmm::device_uvector output_chars(bytes, stream, mr); - - replace_second<<>>( - *device_in, - *device_replacement, - reinterpret_cast(valid_bits.data()), - offsets_view.begin(), - output_chars.data(), - valid_count); - - return cudf::make_strings_column(input.size(), - std::move(offsets), - output_chars.release(), - input.size() - valid_counter.value(stream), - std::move(valid_bits)); + return result; } template <> From 2584fd9d1e1fffb2aefd0417ba0994d7a563e076 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 2 Apr 2024 16:39:46 -0700 Subject: [PATCH 004/842] Test static builds in CI and fix nanoarrow configure (#15437) Resolves #15275 Resolves #15434 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jake Awe (https://github.com/AyodeAwe) - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15437 --- .github/workflows/pr.yaml | 11 ++ .github/workflows/test.yaml | 10 ++ ci/configure_cpp_static.sh | 23 +++ cpp/cmake/thirdparty/get_nanoarrow.cmake | 20 +++ .../thirdparty/patches/nanoarrow_cmake.diff | 161 ++++++++++++++++++ dependencies.yaml | 18 +- 6 files changed, 239 insertions(+), 4 deletions(-) create mode 100755 ci/configure_cpp_static.sh create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 303988212d3..2d7ebb62fa8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -20,6 +20,7 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests + - static-configure - conda-notebook-tests - docs-build - wheel-build-cudf @@ -88,6 +89,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. + container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: conda-python-build secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 6f7aef79881..ea47b6ad466 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -43,6 +43,16 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" + static-configure: + needs: checks + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06 + with: + build_type: pull-request + # Use the wheel container so we can skip conda solves and since our + # primary static consumers (Spark) are not in conda anyway. + container_image: "rapidsai/ci-wheel:latest" + run_script: "ci/configure_cpp_static.sh" conda-python-cudf-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06 diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh new file mode 100755 index 00000000000..675e0c3981f --- /dev/null +++ b/ci/configure_cpp_static.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +rapids-configure-conda-channels + +source rapids-date-string + +rapids-logger "Configure static cpp build" + +ENV_YAML_DIR="$(mktemp -d)" +REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" + +rapids-dependency-file-generator \ + --output requirements \ + --file_key test_static_build \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" + +python -m pip install -r "${REQUIREMENTS_FILE}" +pyenv rehash + +cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DBUILD_TESTS=OFF diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index be938a89ccd..4316db99a8d 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -17,6 +17,25 @@ function(find_and_configure_nanoarrow) set(oneValueArgs VERSION FORK PINNED_TAG) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + # Only run if PKG_VERSION is < 0.5.0 + if(PKG_VERSION VERSION_LESS 0.5.0) + set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff") + set(patch_issues_to_ref + "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]" + ) + set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake") + set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log") + string(TIMESTAMP current_year "%Y" UTC) + configure_file( + ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" @ONLY + ) + else() + message( + FATAL_ERROR + "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf." + ) + endif() + rapids_cpm_find( nanoarrow ${PKG_VERSION} GLOBAL_TARGETS nanoarrow @@ -26,6 +45,7 @@ function(find_and_configure_nanoarrow) # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin # to an actual tag. GIT_SHALLOW FALSE + PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script} OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff new file mode 100644 index 00000000000..b53e134ed2c --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff @@ -0,0 +1,161 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 8714c70..1feec13 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -49,7 +49,6 @@ else() + endif() + + option(NANOARROW_CODE_COVERAGE "Enable coverage reporting" OFF) +-add_library(coverage_config INTERFACE) + + # Avoids a warning about timestamps on downloaded files (prefer new policy + # if available)) +@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE) + if(NANOARROW_BUILD_TESTS) + include_directories(${CMAKE_BINARY_DIR}/amalgamation) + add_library(nanoarrow ${NANOARROW_C_TEMP}) ++ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) ++ + target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") + endif() + +@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE) + else() + add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c + src/nanoarrow/array_stream.c src/nanoarrow/utils.c) ++ add_library(nanoarrow::nanoarrow ALIAS nanoarrow) + + target_include_directories(nanoarrow + PUBLIC $ +@@ -154,13 +156,50 @@ else() + endif() + endif() + +- install(TARGETS nanoarrow DESTINATION lib) ++ install(TARGETS nanoarrow ++ DESTINATION lib ++ EXPORT nanoarrow-exports) + install(DIRECTORY src/ + DESTINATION include + FILES_MATCHING +- PATTERN "*.h") ++ PATTERN "*.h*") + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h + DESTINATION include/nanoarrow) ++ ++ # Generate package files for the build and install trees. ++ include(CMakePackageConfigHelpers) ++ include(GNUInstallDirs) ++ ++ foreach(tree_type BUILD INSTALL) ++ if(tree_type STREQUAL "BUILD") ++ set(install_location ".") ++ else() ++ set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/nanoarrow") ++ endif() ++ ++ set(build_location "${PROJECT_BINARY_DIR}/${install_location}") ++ write_basic_package_version_file( ++ "${build_location}/nanoarrow-config-version.cmake" ++ VERSION ${nanoarrow_VERSION} ++ # After 1.0.0, we can use `SameMajorVersion` here. ++ COMPATIBILITY ExactVersion) ++ configure_package_config_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.cmake.in" ++ "${build_location}/nanoarrow-config.cmake" ++ INSTALL_DESTINATION "${install_location}") ++ ++ if(tree_type STREQUAL "BUILD") ++ export(EXPORT nanoarrow-exports ++ FILE "${build_location}/nanoarrow-targets.cmake" ++ NAMESPACE nanoarrow::) ++ ++ else() ++ install(DIRECTORY "${build_location}/" DESTINATION "${install_location}") ++ install(EXPORT nanoarrow-exports ++ DESTINATION "${install_location}" ++ FILE "nanoarrow-targets.cmake" ++ NAMESPACE nanoarrow::) ++ endif() ++ endforeach() + endif() + + # Always build integration test if building tests +@@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) + src/nanoarrow/integration/c_data_integration_test.cc) + + if(NANOARROW_CODE_COVERAGE) +- target_compile_options(coverage_config INTERFACE -O0 -g --coverage) +- target_link_options(coverage_config INTERFACE --coverage) +- target_link_libraries(nanoarrow coverage_config) ++ target_compile_options(nanoarrow PUBLIC -O0 -g --coverage) ++ target_link_options(nanoarrow PUBLIC --coverage) + endif() + +- target_link_libraries(utils_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(buffer_test nanoarrow gtest_main coverage_config) +- target_link_libraries(array_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(schema_test +- nanoarrow +- gtest_main +- ${NANOARROW_ARROW_TARGET} +- coverage_config) +- target_link_libraries(array_stream_test nanoarrow gtest_main coverage_config) +- target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main coverage_config) +- target_link_libraries(nanoarrow_testing_test +- nanoarrow +- gtest_main +- nlohmann_json::nlohmann_json +- coverage_config) ++ target_link_libraries(utils_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(buffer_test nanoarrow gtest_main) ++ target_link_libraries(array_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(schema_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET}) ++ target_link_libraries(array_stream_test nanoarrow gtest_main) ++ target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main) ++ target_link_libraries(nanoarrow_testing_test nanoarrow gtest_main ++ nlohmann_json::nlohmann_json) + target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration + gtest_main) + +diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in +new file mode 100644 +index 0000000..021dc31 +--- /dev/null ++++ b/cmake/config.cmake.in +@@ -0,0 +1,28 @@ ++# Licensed to the Apache Software Foundation (ASF) under one ++# or more contributor license agreements. See the NOTICE file ++# distributed with this work for additional information ++# regarding copyright ownership. The ASF licenses this file ++# to you under the Apache License, Version 2.0 (the ++# "License"); you may not use this file except in compliance ++# with the License. You may obtain a copy of the License at ++# ++# http://www.apache.org/licenses/LICENSE-2.0 ++# ++# Unless required by applicable law or agreed to in writing, ++# software distributed under the License is distributed on an ++# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY ++# KIND, either express or implied. See the License for the ++# specific language governing permissions and limitations ++# under the License. ++ ++ ++@PACKAGE_INIT@ ++ ++cmake_minimum_required(VERSION @CMAKE_MINIMUM_REQUIRED_VERSION@) ++ ++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-targets.cmake" REQUIRED) ++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-config-version.cmake" REQUIRED) ++ ++set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}") ++include(FindPackageHandleStandardArgs) ++find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE) diff --git a/dependencies.yaml b/dependencies.yaml index 85f5a86d938..5bb555df818 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -6,6 +6,7 @@ files: cuda: ["11.8", "12.2"] arch: [x86_64] includes: + - build_base - build_all - build_cpp - build_wheels @@ -27,6 +28,10 @@ files: - test_python_cudf - test_python_dask_cudf - depends_on_cupy + test_static_build: + output: none + includes: + - build_base test_cpp: output: none includes: @@ -45,6 +50,7 @@ files: test_java: output: none includes: + - build_base - build_all - cuda - cuda_version @@ -75,6 +81,7 @@ files: extras: table: build-system includes: + - build_base - build_python_common - build_python_cudf py_run_cudf: @@ -144,6 +151,7 @@ files: extras: table: build-system includes: + - build_base - build_python_common py_run_cudf_kafka: output: pyproject @@ -191,12 +199,16 @@ channels: - conda-forge - nvidia dependencies: - build_all: + build_base: common: - - output_types: conda + - output_types: [conda, requirements, pyproject] packages: - &cmake_ver cmake>=3.26.4 - &ninja ninja + build_all: + common: + - output_types: conda + packages: - c-compiler - cxx-compiler - dlpack>=0.8,<1.0 @@ -254,9 +266,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - *cmake_ver - cython>=3.0.3 - - *ninja # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==14.0.2.* From 082f6c91eb3906dbdf785348160ad5631ec91458 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 3 Apr 2024 11:27:47 -0400 Subject: [PATCH 005/842] Use offsetalator in cudf::strings::replace functions (#14824) Adds offsetalator in place of hardcoded offset size_type arrays to the strings replace functions. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14824 --- cpp/src/strings/replace/multi.cu | 236 +++---- cpp/src/strings/replace/replace.cu | 791 +++++++++-------------- cpp/src/strings/replace/replace_nulls.cu | 12 +- cpp/src/strings/replace/replace_slice.cu | 25 +- 4 files changed, 463 insertions(+), 601 deletions(-) diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 8b5a4317b50..c93add01f69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include "strings/split/split.cuh" + #include #include -#include #include #include +#include #include -#include #include #include #include @@ -42,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -67,7 +69,7 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; * @brief Type used for holding the target position (first) and the * target index (second). */ -using target_pair = thrust::pair; +using target_pair = thrust::tuple; /** * @brief Helper functions for performing character-parallel replace @@ -75,12 +77,6 @@ using target_pair = thrust::pair; struct replace_multi_parallel_fn { __device__ char const* get_base_ptr() const { return d_strings.head(); } - __device__ size_type const* get_offsets_ptr() const - { - return d_strings.child(strings_column_view::offsets_column_index).data() + - d_strings.offset(); - } - __device__ string_view const get_string(size_type idx) const { return d_strings.element(idx); @@ -100,11 +96,12 @@ struct replace_multi_parallel_fn { * @param idx Index of the byte position in the chars column * @param chars_bytes Number of bytes in the chars column */ - __device__ thrust::optional has_target(size_type idx, size_type chars_bytes) const + __device__ size_type target_index(int64_t idx, int64_t chars_bytes) const { - auto const d_offsets = get_offsets_ptr(); + auto const d_offsets = d_strings_offsets; auto const d_chars = get_base_ptr() + d_offsets[0] + idx; size_type str_idx = -1; + string_view d_str{}; for (std::size_t t = 0; t < d_targets.size(); ++t) { auto const d_tgt = d_targets[t]; if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && @@ -113,12 +110,24 @@ struct replace_multi_parallel_fn { auto const idx_itr = thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); str_idx = thrust::distance(d_offsets, idx_itr) - 1; + d_str = get_string(str_idx - d_offsets[0]); } - auto const d_str = get_string(str_idx - d_offsets[0]); if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return t; } } } - return thrust::nullopt; + return -1; + } + + __device__ bool has_target(int64_t idx, int64_t chars_bytes) const + { + auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx; + for (auto& d_tgt : d_targets) { + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) { + return true; + } + } + return false; } /** @@ -133,28 +142,32 @@ struct replace_multi_parallel_fn { * @return Number of substrings resulting from the replace operations on this row */ __device__ size_type count_strings(size_type idx, - target_pair const* d_positions, - size_type const* d_targets_offsets) const + int64_t const* d_positions, + size_type const* d_indices, + cudf::detail::input_offsetalator d_targets_offsets) const { if (!is_valid(idx)) { return 0; } - auto const d_str = get_string(idx); - auto const d_str_end = d_str.data() + d_str.size_bytes(); - auto const base_ptr = get_base_ptr(); - auto const targets_positions = cudf::device_span( - d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + auto const indices = d_indices + target_offset; size_type count = 1; // always at least one string auto str_ptr = d_str.data(); - for (auto d_pair : targets_positions) { - auto const d_pos = d_pair.first; - auto const d_tgt = d_targets[d_pair.second]; - auto const tgt_ptr = base_ptr + d_pos; + for (std::size_t i = 0; i < targets_size; ++i) { + auto const tgt_idx = indices[i]; + auto const d_tgt = d_targets[tgt_idx]; + auto const tgt_ptr = base_ptr + positions[i]; if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); if (keep_size > 0) { count++; } // don't bother counting empty strings - auto const d_repl = get_replacement_string(d_pair.second); + auto const d_repl = get_replacement_string(tgt_idx); if (!d_repl.empty()) { count++; } str_ptr += keep_size + d_tgt.size_bytes(); @@ -182,9 +195,10 @@ struct replace_multi_parallel_fn { * @return The size in bytes of the output string for this row */ __device__ size_type get_strings(size_type idx, - size_type const* d_offsets, - target_pair const* d_positions, - size_type const* d_targets_offsets, + cudf::detail::input_offsetalator const d_offsets, + int64_t const* d_positions, + size_type const* d_indices, + cudf::detail::input_offsetalator d_targets_offsets, string_index_pair* d_all_strings) const { if (!is_valid(idx)) { return 0; } @@ -194,22 +208,24 @@ struct replace_multi_parallel_fn { auto const d_str_end = d_str.data() + d_str.size_bytes(); auto const base_ptr = get_base_ptr(); - auto const targets_positions = cudf::device_span( - d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]); + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + auto const indices = d_indices + target_offset; size_type output_idx = 0; size_type output_size = 0; auto str_ptr = d_str.data(); - for (auto d_pair : targets_positions) { - auto const d_pos = d_pair.first; - auto const d_tgt = d_targets[d_pair.second]; - auto const tgt_ptr = base_ptr + d_pos; + for (std::size_t i = 0; i < targets_size; ++i) { + auto const tgt_idx = indices[i]; + auto const d_tgt = d_targets[tgt_idx]; + auto const tgt_ptr = base_ptr + positions[i]; if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } output_size += keep_size; - auto const d_repl = get_replacement_string(d_pair.second); + auto const d_repl = get_replacement_string(tgt_idx); if (!d_repl.empty()) { d_output[output_idx++] = string_index_pair{d_repl.data(), d_repl.size_bytes()}; } @@ -228,14 +244,19 @@ struct replace_multi_parallel_fn { } replace_multi_parallel_fn(column_device_view const& d_strings, + cudf::detail::input_offsetalator d_strings_offsets, device_span d_targets, device_span d_replacements) - : d_strings(d_strings), d_targets{d_targets}, d_replacements{d_replacements} + : d_strings(d_strings), + d_strings_offsets(d_strings_offsets), + d_targets{d_targets}, + d_replacements{d_replacements} { } protected: column_device_view d_strings; + cudf::detail::input_offsetalator d_strings_offsets; device_span d_targets; device_span d_replacements; }; @@ -247,17 +268,16 @@ struct replace_multi_parallel_fn { * (this happens sometimes when passing device lambdas to thrust algorithms) */ struct pair_generator { - __device__ target_pair operator()(int idx) const + __device__ target_pair operator()(int64_t idx) const { - auto pos = fn.has_target(idx, chars_bytes); - return target_pair{idx, pos.value_or(-1)}; + return thrust::make_tuple(idx, fn.target_index(idx, chars_bytes)); } replace_multi_parallel_fn fn; - size_type chars_bytes; + int64_t chars_bytes; }; struct copy_if_fn { - __device__ bool operator()(target_pair pos) { return pos.second >= 0; } + __device__ bool operator()(target_pair pos) { return thrust::get<1>(pos) >= 0; } }; std::unique_ptr replace_character_parallel(strings_column_view const& input, @@ -270,105 +290,91 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in auto const strings_count = input.size(); auto const chars_bytes = - cudf::detail::get_value(input.offsets(), input.offset() + strings_count, stream) - - cudf::detail::get_value(input.offsets(), input.offset(), stream); + get_offset_value(input.offsets(), input.offset() + strings_count, stream) - + get_offset_value(input.offsets(), input.offset(), stream); auto d_targets = create_string_vector_from_column(targets, stream, rmm::mr::get_current_device_resource()); auto d_replacements = create_string_vector_from_column(repls, stream, rmm::mr::get_current_device_resource()); - replace_multi_parallel_fn fn{*d_strings, d_targets, d_replacements}; + replace_multi_parallel_fn fn{ + *d_strings, + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()), + d_targets, + d_replacements, + }; + + // Count the number of targets in the entire column. + // Note this may over-count in the case where a target spans adjacent strings. + auto target_count = thrust::count_if( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); }); - // count the number of targets in the entire column - auto const target_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - [fn, chars_bytes] __device__(size_type idx) { - return fn.has_target(idx, chars_bytes).has_value(); - }); // Create a vector of every target position in the chars column. - // These may include overlapping targets which will be resolved later. - auto targets_positions = rmm::device_uvector(target_count, stream); + // These may also include overlapping targets which will be resolved later. + auto targets_positions = rmm::device_uvector(target_count, stream); + auto targets_indices = rmm::device_uvector(target_count, stream); + + // cudf::detail::make_counting_transform_iterator hardcodes size_type + auto const copy_itr = thrust::make_transform_iterator(thrust::counting_iterator(0), + pair_generator{fn, chars_bytes}); + auto const out_itr = thrust::make_zip_iterator( + thrust::make_tuple(targets_positions.begin(), targets_indices.begin())); + auto const copy_end = + cudf::detail::copy_if_safe(copy_itr, copy_itr + chars_bytes, out_itr, copy_if_fn{}, stream); + + // adjust target count since the copy-if may have eliminated some invalid targets + target_count = std::min(static_cast(std::distance(out_itr, copy_end)), target_count); + targets_positions.resize(target_count, stream); + targets_indices.resize(target_count, stream); auto d_positions = targets_positions.data(); - - auto const copy_itr = - cudf::detail::make_counting_transform_iterator(0, pair_generator{fn, chars_bytes}); - auto const copy_end = thrust::copy_if( - rmm::exec_policy(stream), copy_itr, copy_itr + chars_bytes, d_positions, copy_if_fn{}); + auto d_targets_indices = targets_indices.data(); // create a vector of offsets to each string's set of target positions - auto const targets_offsets = [&] { - auto string_indices = rmm::device_uvector(target_count, stream); - - auto const pos_itr = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type([d_positions] __device__(auto idx) -> int64_t { - return d_positions[idx].first; - })); - auto pos_count = std::distance(d_positions, copy_end); - - auto begin = - cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - auto end = begin + input.offsets().size(); - thrust::upper_bound( - rmm::exec_policy(stream), begin, end, pos_itr, pos_itr + pos_count, string_indices.begin()); - - // compute offsets per string - auto targets_offsets = rmm::device_uvector(strings_count + 1, stream); - auto d_targets_offsets = targets_offsets.data(); - - // memset to zero-out the target counts for any null-entries or strings with no targets - thrust::uninitialized_fill( - rmm::exec_policy(stream), targets_offsets.begin(), targets_offsets.end(), 0); - - // next, count the number of targets per string - auto d_string_indices = string_indices.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - target_count, - [d_string_indices, d_targets_offsets] __device__(size_type idx) { - auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_targets_offsets + str_idx, 1); - }); - // finally, convert the counts into offsets - thrust::exclusive_scan(rmm::exec_policy(stream), - targets_offsets.begin(), - targets_offsets.end(), - targets_offsets.begin()); - return targets_offsets; - }(); - auto const d_targets_offsets = targets_offsets.data(); + auto const targets_offsets = create_offsets_from_positions( + input, targets_positions, stream, rmm::mr::get_current_device_resource()); + auto const d_targets_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view()); // compute the number of string segments produced by replace in each string auto counts = rmm::device_uvector(strings_count, stream); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count), counts.begin(), cuda::proclaim_return_type( - [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { - return fn.count_strings(idx, d_positions, d_targets_offsets); + [fn, d_positions, d_targets_indices, d_targets_offsets] __device__( + size_type idx) -> size_type { + return fn.count_strings( + idx, d_positions, d_targets_indices, d_targets_offsets); })); // create offsets from the counts - auto offsets = - std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); - auto const total_strings = - cudf::detail::get_value(offsets->view(), strings_count, stream); - auto const d_strings_offsets = offsets->view().data(); + auto [offsets, total_strings] = + cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + auto const d_strings_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); // build a vector of all the positions for all the strings auto indices = rmm::device_uvector(total_strings, stream); auto d_indices = indices.data(); auto d_sizes = counts.data(); // reusing this vector to hold output sizes now thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), strings_count, - [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( - size_type idx) { - d_sizes[idx] = - fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); + [fn, + d_strings_offsets, + d_positions, + d_targets_indices, + d_targets_offsets, + d_indices, + d_sizes] __device__(size_type idx) { + d_sizes[idx] = fn.get_strings( + idx, d_strings_offsets, d_positions, d_targets_indices, d_targets_offsets, d_indices); }); // use this utility to gather the string parts into a contiguous chars column @@ -376,8 +382,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in auto chars_data = chars->release().data; // create offsets from the sizes - offsets = - std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); + offsets = std::get<0>( + cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); // build the strings columns from the chars and offsets return make_strings_column(strings_count, diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 1f752f543d0..2c548f2f7cd 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -14,20 +14,21 @@ * limitations under the License. */ +#include "strings/split/split.cuh" + #include #include -#include #include #include -#include +#include #include #include +#include #include #include #include #include #include -#include #include #include @@ -39,11 +40,7 @@ #include #include #include -#include #include -#include -#include -#include #include namespace cudf { @@ -52,505 +49,375 @@ namespace detail { namespace { /** - * @brief Average string byte-length threshold for deciding character-level vs row-level parallel - * algorithm. + * @brief Threshold to decide on using string or character-parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the character-parallel function is used. + * Otherwise, a regular string-parallel function is used. * - * This value was determined by running the replace string scalar benchmark against different - * power-of-2 string lengths and observing the point at which the performance only improved for - * all trials. + * This value was found using the replace-multi benchmark results using an + * RTX A6000. */ -constexpr size_type BYTES_PER_VALID_ROW_THRESHOLD = 64; +constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256; /** - * @brief Function logic for the row-level parallelism replace API. - * - * This will perform a replace operation on each string. + * @brief Helper functions for performing character-parallel replace */ -struct replace_row_parallel_fn { - column_device_view const d_strings; - string_view const d_target; - string_view const d_repl; - int32_t const max_repl; - int32_t* d_offsets{}; - char* d_chars{}; +struct replace_parallel_chars_fn { + __device__ inline char const* get_base_ptr() const { return d_strings.head(); } - __device__ void operator()(size_type idx) + __device__ inline string_view const get_string(size_type idx) const { - if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; - return; - } - auto const d_str = d_strings.element(idx); - char const* in_ptr = d_str.data(); - - char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; - auto max_n = (max_repl < 0) ? d_str.length() : max_repl; - auto bytes = d_str.size_bytes(); - auto position = d_str.find(d_target); - - size_type last_pos = 0; - while ((position != string_view::npos) && (max_n > 0)) { - if (out_ptr) { - auto const curr_pos = d_str.byte_offset(position); - out_ptr = copy_and_increment(out_ptr, in_ptr + last_pos, curr_pos - last_pos); // copy left - out_ptr = copy_string(out_ptr, d_repl); // copy repl - last_pos = curr_pos + d_target.size_bytes(); - } else { - bytes += d_repl.size_bytes() - d_target.size_bytes(); - } - position = d_str.find(d_target, position + d_target.length()); - --max_n; - } - if (out_ptr) // copy whats left (or right depending on your point of view) - memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos); - else - d_offsets[idx] = bytes; + return d_strings.element(idx); } -}; -/** - * @brief Functor for detecting falsely-overlapped target positions. - * - * This functor examines target positions that have been flagged as potentially overlapped by - * a previous target position and identifies the overlaps that are false. A false overlap can occur - * when a target position is overlapped by another target position that is itself overlapped. - * - * For example, a target string of "+++" and string to search of "++++++" will generate 4 potential - * target positions at char offsets 0 through 3. The targets at offsets 1, 2, and 3 will be flagged - * as potential overlaps since a prior target position is within range of the target string length. - * The targets at offset 1 and 2 are true overlaps, since the footprint of the valid target at - * offset 0 overlaps with them. The target at offset 3 is not truly overlapped because it is only - * overlapped by invalid targets, targets that were themselves overlapped by a valid target. - */ -struct target_false_overlap_filter_fn { - size_type const* const d_overlap_pos_indices{}; - size_type const* const d_target_positions{}; - size_type const target_size{}; + __device__ inline bool is_valid(size_type idx) const { return d_strings.is_valid(idx); } - __device__ bool operator()(size_type overlap_idx) const + /** + * @brief Returns true if the target string is found at the given byte position + * in the input strings column and is legally within a string row + * + * @param idx Index of the byte position in the chars column + */ + __device__ bool is_target_within_row(int64_t idx) const { - if (overlap_idx == 0) { - // The first overlap has no prior overlap to chain, so it should be kept as an overlap. - return false; + auto const d_offsets = d_strings_offsets; + auto const d_chars = get_base_ptr() + idx; + auto const d_tgt = d_target; + auto const chars_end = chars_bytes + d_offsets[0]; + if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_end) && + (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) { + auto const idx_itr = + thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx); + auto str_idx = static_cast(thrust::distance(d_offsets, idx_itr) - 1); + auto d_str = get_string(str_idx); + if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return true; } } + return false; + } - size_type const this_pos_idx = d_overlap_pos_indices[overlap_idx]; - - // Searching backwards for the first target position index of an overlap that is not adjacent - // to its overlap predecessor. The result will be the first overlap in this chain of overlaps. - size_type first_overlap_idx = overlap_idx; - size_type first_pos_idx = this_pos_idx; - while (first_overlap_idx > 0) { - size_type prev_pos_idx = d_overlap_pos_indices[--first_overlap_idx]; - if (prev_pos_idx + 1 != first_pos_idx) { break; } - first_pos_idx = prev_pos_idx; - } + /** + * @brief Returns true if the target string found at the given byte position + * + * @param idx Index of the byte position in the chars column + */ + __device__ bool has_target(int64_t idx) const + { + auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx; + return (!d_target.empty() && (idx + d_target.size_bytes() <= chars_bytes) && + (d_target.compare(d_chars, d_target.size_bytes()) == 0)); + } - // The prior target position to the first overlapped position in the chain is a valid target. - size_type valid_pos_idx = first_pos_idx - 1; - size_type valid_pos = d_target_positions[valid_pos_idx]; - - // Walk forward from this valid target. Any targets within the range of this valid one are true - // overlaps. The first overlap beyond the range of this valid target is another valid target, - // as it was falsely overlapped by a target that was itself overlapped. Repeat until we get to - // the overlapped position being queried by this call. - while (valid_pos_idx < this_pos_idx) { - size_type next_pos_idx = valid_pos_idx + 1; - size_type next_pos = d_target_positions[next_pos_idx]; - // Every target position within the range of a valid target position is a true overlap. - while (next_pos < valid_pos + target_size) { - if (next_pos_idx == this_pos_idx) { return false; } - next_pos = d_target_positions[++next_pos_idx]; + /** + * @brief Count the number of strings that will be produced by the replace + * + * This includes segments of the string that are not replaced as well as those + * that are replaced. + * + * @param idx Index of the row in d_strings to be processed + * @param d_positions Positions of the targets found in the chars column + * @param d_targets_offsets Offsets identify which target positions go with the current string + * @return Number of substrings resulting from the replace operations on this row + */ + __device__ size_type count_strings(size_type idx, + int64_t const* d_positions, + cudf::detail::input_offsetalator d_targets_offsets) const + { + if (!is_valid(idx)) { return 0; } + + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto max_n = (maxrepl < 0) ? d_str.length() : maxrepl; + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + + size_type count = 1; // always at least one string + auto str_ptr = d_str.data(); + for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) { + auto const tgt_ptr = base_ptr + positions[i]; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { count++; } // don't bother counting empty strings + if (!d_replacement.empty()) { count++; } + str_ptr += keep_size + d_target.size_bytes(); + --max_n; } - valid_pos_idx = next_pos_idx; - valid_pos = next_pos; } - - // This was overlapped only by false overlaps and therefore is a valid target. - return true; + return count; } -}; -/** - * @brief Functor for replacing each target string with the replacement string. - * - * This will perform a replace operation at each target position. - */ -struct target_replacer_fn { - device_span const d_target_positions; - char const* const d_in_chars{}; - char* const d_out_chars{}; - size_type const target_size{}; - string_view const d_repl; - int32_t const in_char_offset = 0; - - __device__ void operator()(size_type input_idx) const + /** + * @brief Retrieve the strings for each row + * + * This will return string segments as string_index_pair objects for + * parts of the string that are not replaced interlaced with the + * appropriate replacement string where replacement targets are found. + * + * This function is called only once to produce both the string_index_pair objects + * and the output row size in bytes. + * + * @param idx Index of the row in d_strings + * @param d_offsets Offsets to identify where to store the results of the replace for this string + * @param d_positions The target positions found in the chars column + * @param d_targets_offsets The offsets to identify which target positions go with this string + * @param d_all_strings The output of all the produced string segments + * @return The size in bytes of the output string for this row + */ + __device__ size_type get_strings(size_type idx, + cudf::detail::input_offsetalator const d_offsets, + int64_t const* d_positions, + cudf::detail::input_offsetalator d_targets_offsets, + string_index_pair* d_all_strings) const { - // Calculate the adjustment from input index to output index for each prior target position. - auto const repl_size = d_repl.size_bytes(); - auto const idx_delta_per_pos = repl_size - target_size; - - // determine the number of target positions at or before this character position - size_type const* next_target_pos_ptr = thrust::upper_bound( - thrust::seq, d_target_positions.begin(), d_target_positions.end(), input_idx); - size_type const num_prev_targets = next_target_pos_ptr - d_target_positions.data(); - size_type output_idx = input_idx - in_char_offset + idx_delta_per_pos * num_prev_targets; - - if (num_prev_targets == 0) { - // not within a target string - d_out_chars[output_idx] = d_in_chars[input_idx]; - } else { - // check if this input position is within a target string - size_type const prev_target_pos = *(next_target_pos_ptr - 1); - size_type target_idx = input_idx - prev_target_pos; - if (target_idx < target_size) { - // within the target string, so the original calculation was off by one target string - output_idx -= idx_delta_per_pos; - - // Copy the corresponding byte from the replacement string. If the replacement string is - // larger than the target string then the thread reading the last target byte is - // responsible for copying the remainder of the replacement string. - if (target_idx < repl_size) { - d_out_chars[output_idx++] = d_repl.data()[target_idx++]; - if (target_idx == target_size) { - memcpy(d_out_chars + output_idx, d_repl.data() + target_idx, repl_size - target_idx); - } + if (!is_valid(idx)) { return 0; } + + auto const d_output = d_all_strings + d_offsets[idx]; + auto const d_str = get_string(idx); + auto const d_str_end = d_str.data() + d_str.size_bytes(); + auto const base_ptr = get_base_ptr(); + auto max_n = (maxrepl < 0) ? d_str.length() : maxrepl; + + auto const target_offset = d_targets_offsets[idx]; + auto const targets_size = static_cast(d_targets_offsets[idx + 1] - target_offset); + auto const positions = d_positions + target_offset; + + size_type output_idx = 0; + size_type output_size = 0; + auto str_ptr = d_str.data(); + for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) { + auto const tgt_ptr = base_ptr + positions[i]; + if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) { + auto const keep_size = static_cast(thrust::distance(str_ptr, tgt_ptr)); + if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; } + output_size += keep_size; + + if (!d_replacement.empty()) { + d_output[output_idx++] = + string_index_pair{d_replacement.data(), d_replacement.size_bytes()}; } - } else { - // not within a target string - d_out_chars[output_idx] = d_in_chars[input_idx]; + output_size += d_replacement.size_bytes(); + + str_ptr += keep_size + d_target.size_bytes(); + --max_n; } } + // include any leftover parts of the string + if (str_ptr <= d_str_end) { + auto const left_size = static_cast(thrust::distance(str_ptr, d_str_end)); + d_output[output_idx] = string_index_pair{str_ptr, left_size}; + output_size += left_size; + } + return output_size; } + + replace_parallel_chars_fn(column_device_view const& d_strings, + cudf::detail::input_offsetalator d_strings_offsets, + int64_t chars_bytes, + string_view d_target, + string_view d_replacement, + cudf::size_type maxrepl) + : d_strings(d_strings), + d_strings_offsets(d_strings_offsets), + chars_bytes(chars_bytes), + d_target{d_target}, + d_replacement{d_replacement}, + maxrepl(maxrepl) + { + } + + protected: + column_device_view d_strings; + cudf::detail::input_offsetalator d_strings_offsets; + int64_t chars_bytes; + string_view d_target; + string_view d_replacement; + cudf::size_type maxrepl; }; -/** - * @brief Filter target positions that are overlapped by other, valid target positions. - * - * This performs an in-place modification of the target positions to remove any target positions - * that are overlapped by other, valid target positions. For example, if the target string is "++" - * and the string to search is "+++" then there will be two potential targets at character offsets - * 0 and 1. The target at offset 0 is valid and overlaps the target at offset 1, invalidating the - * target at offset 1. - * - * @param[in,out] d_target_positions Potential target positions to filter in-place. - * @param[in] target_count Number of potential target positions. - * @param[in] target_size Size of the target string in bytes. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. - */ -size_type filter_overlap_target_positions(size_type* d_target_positions, - size_type target_count, - size_type target_size, - rmm::cuda_stream_view stream) +std::unique_ptr replace_character_parallel(strings_column_view const& input, + string_view const& d_target, + string_view const& d_replacement, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto overlap_detector = [d_target_positions, target_size] __device__(size_type pos_idx) -> bool { - return (pos_idx > 0) - ? d_target_positions[pos_idx] - d_target_positions[pos_idx - 1] < target_size - : false; - }; - - // count the potential number of overlapped target positions - size_type overlap_count = - thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(target_count), - overlap_detector); - if (overlap_count == 0) { return target_count; } - - // create a vector indexing the potential overlapped target positions - rmm::device_uvector potential_overlapped_pos_indices(overlap_count, stream); - auto d_potential_overlapped_pos_indices = potential_overlapped_pos_indices.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(target_count), - d_potential_overlapped_pos_indices, - overlap_detector); - - // filter out the false overlaps that are actually valid - rmm::device_uvector overlapped_pos_indices(overlap_count, stream); - auto d_overlapped_pos_indices = overlapped_pos_indices.data(); - auto overlap_end = - thrust::remove_copy_if(rmm::exec_policy(stream), - d_potential_overlapped_pos_indices, - d_potential_overlapped_pos_indices + overlap_count, - thrust::make_counting_iterator(0), - d_overlapped_pos_indices, - target_false_overlap_filter_fn{ - d_potential_overlapped_pos_indices, d_target_positions, target_size}); - overlap_count = cudf::distance(d_overlapped_pos_indices, overlap_end); - - // In-place remove any target positions that are overlapped by valid target positions - auto target_pos_end = thrust::remove_if( - rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_count, + auto d_strings = column_device_view::create(input.parent(), stream); + + auto const strings_count = input.size(); + auto const chars_offset = get_offset_value(input.offsets(), input.offset(), stream); + auto const chars_bytes = + get_offset_value(input.offsets(), input.offset() + strings_count, stream) - chars_offset; + + auto const offsets_begin = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + replace_parallel_chars_fn fn{ + *d_strings, offsets_begin, chars_bytes, d_target, d_replacement, maxrepl}; + + // Count the number of targets in the entire column. + // Note this may over-count in the case where a target spans adjacent strings. + auto target_count = thrust::count_if(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + [fn] __device__(int64_t idx) { return fn.has_target(idx); }); + + // Create a vector of every target position in the chars column. + // These may also include overlapping targets which will be resolved later. + auto targets_positions = rmm::device_uvector(target_count, stream); + auto const copy_itr = thrust::counting_iterator(chars_offset); + auto const copy_end = cudf::detail::copy_if_safe( + copy_itr, + copy_itr + chars_bytes + chars_offset, + targets_positions.begin(), + [fn] __device__(int64_t idx) { return fn.is_target_within_row(idx); }, + stream); + + // adjust target count since the copy-if may have eliminated some invalid targets + target_count = std::min(std::distance(targets_positions.begin(), copy_end), target_count); + targets_positions.resize(target_count, stream); + auto d_positions = targets_positions.data(); + + // create a vector of offsets to each string's set of target positions + auto const targets_offsets = create_offsets_from_positions( + input, targets_positions, stream, rmm::mr::get_current_device_resource()); + auto const d_targets_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view()); + + // compute the number of string segments produced by replace in each string + auto counts = rmm::device_uvector(strings_count, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(strings_count), + counts.begin(), + cuda::proclaim_return_type( + [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type { + return fn.count_strings(idx, d_positions, d_targets_offsets); + })); + + // create offsets from the counts + auto [offsets, total_strings] = + cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); + auto const d_strings_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()); + + // build a vector of all the positions for all the strings + auto indices = rmm::device_uvector(total_strings, stream); + auto d_indices = indices.data(); + auto d_sizes = counts.data(); // reusing this vector to hold output sizes now + thrust::for_each_n( + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - [d_overlapped_pos_indices, overlap_count] __device__(size_type target_position_idx) -> bool { - return thrust::binary_search(thrust::seq, - d_overlapped_pos_indices, - d_overlapped_pos_indices + overlap_count, - target_position_idx); + strings_count, + [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__( + size_type idx) { + d_sizes[idx] = + fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices); }); - return cudf::distance(d_target_positions, target_pos_end); -} -/** - * @brief Filter target positions to remove any invalid target positions. - * - * This performs an in-place modification of the target positions to remove any target positions - * that are invalid, either by the target string overlapping a row boundary or being overlapped by - * another valid target string. - * - * @param[in,out] target_positions Potential target positions to filter in-place. - * @param[in] d_offsets_span Memory range encompassing the string column offsets. - * @param[in] target_size Size of the target string in bytes. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. - */ -size_type filter_false_target_positions(rmm::device_uvector& target_positions, - device_span d_offsets_span, - size_type target_size, - rmm::cuda_stream_view stream) -{ - // In-place remove any positions for target strings that crossed string boundaries. - auto d_target_positions = target_positions.data(); - auto target_pos_end = - thrust::remove_if(rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_positions.size(), - [d_offsets_span, target_size] __device__(size_type target_pos) -> bool { - // find the end of the string containing the start of this target - size_type const* offset_ptr = thrust::upper_bound( - thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos); - return target_pos + target_size > *offset_ptr; - }); - auto const target_count = cudf::distance(d_target_positions, target_pos_end); - if (target_count == 0) { return 0; } - - // Filter out target positions that are the result of overlapping target matches. - return (target_count > 1) - ? filter_overlap_target_positions(d_target_positions, target_count, target_size, stream) - : target_count; -} + // use this utility to gather the string parts into a contiguous chars column + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; -/** - * @brief Filter target positions beyond the maximum target replacements per row limit. - * - * This performs an in-place modification of the target positions to remove any target positions - * corresponding to targets that should not be replaced due to the maximum target replacement per - * row limit. - * - * @param[in,out] target_positions Target positions to filter in-place. - * @param[in] target_count Number of target positions. - * @param[in] d_offsets_span Memory range encompassing the string column offsets. - * @param[in] max_repl_per_row Maximum target replacements per row limit. - * @param[in] stream CUDA stream to use for device operations. - * @return Number of target positions after filtering. - */ -size_type filter_maxrepl_target_positions(size_type* d_target_positions, - size_type target_count, - device_span d_offsets_span, - size_type max_repl_per_row, - rmm::cuda_stream_view stream) -{ - auto pos_to_row_fn = cuda::proclaim_return_type( - [d_offsets_span] __device__(size_type target_pos) -> size_type { - auto upper_bound = - thrust::upper_bound(thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos); - return thrust::distance(d_offsets_span.begin(), upper_bound); - }); + // create offsets from the sizes + offsets = std::get<0>( + cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr)); - // compute the match count per row for each target position - rmm::device_uvector match_counts(target_count, stream); - auto d_match_counts = match_counts.data(); - thrust::inclusive_scan_by_key( - rmm::exec_policy(stream), - thrust::make_transform_iterator(d_target_positions, pos_to_row_fn), - thrust::make_transform_iterator(d_target_positions + target_count, pos_to_row_fn), - thrust::make_constant_iterator(1), - d_match_counts); - - // In-place remove any positions that exceed the per-row match limit - auto target_pos_end = - thrust::remove_if(rmm::exec_policy(stream), - d_target_positions, - d_target_positions + target_count, - d_match_counts, - [max_repl_per_row] __device__(size_type match_count) -> bool { - return match_count > max_repl_per_row; - }); - - return cudf::distance(d_target_positions, target_pos_end); + // build the strings columns from the chars and offsets + return make_strings_column(strings_count, + std::move(offsets), + std::move(chars_data.release()[0]), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** - * @brief Scalar string replacement using a character-level parallel algorithm. - * - * Replaces occurrences of the target string with the replacement string using an algorithm with - * character-level parallelism. This algorithm will perform well when the strings in the string - * column are relatively long. - * @see BYTES_PER_VALID_ROW_THRESHOLD + * @brief Function logic for the replace_string_parallel * - * @param strings String column to search for target strings. - * @param chars_start Offset of the first character in the string column. - * @param chars_end Offset beyond the last character in the string column to search. - * @param d_target String to search for within the string column. - * @param d_repl Replacement string if target string is found. - * @param maxrepl Maximum times to replace if target appears multiple times in a string. - * @param stream CUDA stream to use for device operations - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column. + * Performs the multi-replace operation with a thread per string. + * This performs best on smaller strings. @see AVG_CHAR_BYTES_THRESHOLD */ -std::unique_ptr replace_char_parallel(strings_column_view const& strings, - size_type chars_start, - size_type chars_end, - string_view const& d_target, - string_view const& d_repl, - int32_t maxrepl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const strings_count = strings.size(); - auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().begin() + strings.offset(); // TODO: PR 14824 - auto const d_in_chars = strings.chars_begin(stream); - auto const chars_bytes = chars_end - chars_start; - auto const target_size = d_target.size_bytes(); - - // detect a target match at the specified byte position - device_span const d_chars_span(d_in_chars, chars_end); - auto target_detector = [d_chars_span, d_target] __device__(size_type char_idx) { - auto target_size = d_target.size_bytes(); - auto target_ptr = d_chars_span.begin() + char_idx; - return target_ptr + target_size <= d_chars_span.end() && - d_target.compare(target_ptr, target_size) == 0; - }; - - // Count target string matches across all character positions, ignoring string boundaries and - // overlapping target strings. This may produce false-positives. - size_type target_count = thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - target_detector); - if (target_count == 0) { - // nothing to replace, copy the input column - return std::make_unique(strings.parent(), stream, mr); - } +struct replace_fn { + column_device_view const d_strings; + string_view d_target; + string_view d_replacement; + cudf::size_type maxrepl; + cudf::size_type* d_offsets{}; + char* d_chars{}; - // create a vector of the potential target match positions - rmm::device_uvector target_positions(target_count, stream); - auto d_target_positions = target_positions.data(); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - thrust::make_counting_iterator(chars_end), - d_target_positions, - target_detector); - - device_span d_offsets_span(d_offsets, offset_count); - if (target_size > 1) { - target_count = - filter_false_target_positions(target_positions, d_offsets_span, target_size, stream); - if (target_count == 0) { - // nothing to replace, copy the input column - return std::make_unique(strings.parent(), stream, mr); + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { + if (!d_chars) { d_offsets[idx] = 0; } + return; } - } + auto const d_str = d_strings.element(idx); + char const* in_ptr = d_str.data(); - // filter out any target positions that exceed the per-row match limit - if (maxrepl > 0 && target_count > maxrepl) { - target_count = filter_maxrepl_target_positions( - d_target_positions, target_count, d_offsets_span, maxrepl, stream); + size_type bytes = d_str.size_bytes(); + size_type spos = 0; + size_type lpos = 0; + char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto max_n = (maxrepl < 0) ? d_str.length() : maxrepl; + + // check each character against each target + while (spos < d_str.size_bytes() && (max_n > 0)) { + auto const d_tgt = d_target; + if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) && // check fit + (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0)) // and match + { + auto const d_repl = d_replacement; + bytes += d_repl.size_bytes() - d_tgt.size_bytes(); + if (out_ptr) { + out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); + out_ptr = copy_string(out_ptr, d_repl); + lpos = spos + d_tgt.size_bytes(); + } + spos += d_tgt.size_bytes() - 1; + --max_n; + } + ++spos; + } + if (out_ptr) { // copy remainder + memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos); + } else { + d_offsets[idx] = bytes; + } } +}; - // build the offsets column - auto offsets_column = make_numeric_column( - data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr); - auto offsets_view = offsets_column->mutable_view(); - auto delta_per_target = d_repl.size_bytes() - target_size; - device_span d_target_positions_span(d_target_positions, target_count); - auto offsets_update_fn = cuda::proclaim_return_type( - [d_target_positions_span, delta_per_target, chars_start] __device__(int32_t offset) -> int32_t { - // determine the number of target positions occurring before this offset - size_type const* next_target_pos_ptr = thrust::lower_bound( - thrust::seq, d_target_positions_span.begin(), d_target_positions_span.end(), offset); - size_type num_prev_targets = - thrust::distance(d_target_positions_span.data(), next_target_pos_ptr); - return offset - chars_start + delta_per_target * num_prev_targets; - }); - thrust::transform(rmm::exec_policy(stream), - d_offsets_span.begin(), - d_offsets_span.end(), - offsets_view.begin(), - offsets_update_fn); - - // build the characters column - rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars.data(); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(chars_start), - chars_bytes, - target_replacer_fn{ - d_target_positions_span, d_in_chars, d_out_chars, target_size, d_repl, chars_start}); - - // free the target positions buffer as it is no longer needed - (void)target_positions.release(); - - return make_strings_column(strings_count, - std::move(offsets_column), - chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); -} - -/** - * @brief Scalar string replacement using a row-level parallel algorithm. - * - * Replaces occurrences of the target string with the replacement string using an algorithm with - * row-level parallelism. This algorithm will perform well when the strings in the string - * column are relatively short. - * @see BYTES_PER_VALID_ROW_THRESHOLD - * - * @param strings String column to search for target strings. - * @param d_target String to search for within the string column. - * @param d_repl Replacement string if target string is found. - * @param maxrepl Maximum times to replace if target appears multiple times in a string. - * @param stream CUDA stream to use for device operations - * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings column. - */ -std::unique_ptr replace_row_parallel(strings_column_view const& strings, - string_view const& d_target, - string_view const& d_repl, - int32_t maxrepl, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr replace_string_parallel(strings_column_view const& input, + string_view const& d_target, + string_view const& d_replacement, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_strings = column_device_view::create(input.parent(), stream); - // this utility calls the given functor to build the offsets and chars columns auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); + replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets_column), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } } // namespace -std::unique_ptr replace(strings_column_view const& strings, +std::unique_ptr replace(strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl, + cudf::size_type maxrepl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); - if (maxrepl == 0) return std::make_unique(strings.parent(), stream, mr); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } + if (maxrepl == 0) { return std::make_unique(input.parent(), stream, mr); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); @@ -558,25 +425,11 @@ std::unique_ptr replace(strings_column_view const& strings, string_view d_target(target.data(), target.size()); string_view d_repl(repl.data(), repl.size()); - // determine range of characters in the base column - auto const strings_count = strings.size(); - auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().data() + strings.offset(); - size_type const chars_start = - (strings.offset() == 0) - ? 0 - : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); - size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size(stream) - : cudf::detail::get_value( - strings.offsets(), strings.offset() + strings_count, stream); - size_type const chars_bytes = chars_end - chars_start; - - auto const avg_bytes_per_row = chars_bytes / std::max(strings_count - strings.null_count(), 1); - return (avg_bytes_per_row < BYTES_PER_VALID_ROW_THRESHOLD) - ? replace_row_parallel(strings, d_target, d_repl, maxrepl, stream, mr) - : replace_char_parallel( - strings, chars_start, chars_end, d_target, d_repl, maxrepl, stream, mr); + return (input.size() == input.null_count() || + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) + ? replace_string_parallel(input, d_target, d_repl, maxrepl, stream, mr) + : replace_character_parallel(input, d_target, d_repl, maxrepl, stream, mr); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_nulls.cu b/cpp/src/strings/replace/replace_nulls.cu index 26fb1c7819f..bbca4997f57 100644 --- a/cpp/src/strings/replace/replace_nulls.cu +++ b/cpp/src/strings/replace/replace_nulls.cu @@ -36,18 +36,18 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr replace_nulls(strings_column_view const& strings, +std::unique_ptr replace_nulls(strings_column_view const& input, string_scalar const& repl, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(type_id::STRING); + size_type strings_count = input.size(); + if (strings_count == 0) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // build offsets column @@ -58,12 +58,12 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, })); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto d_offsets = offsets_column->view().data(); + auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // build chars column rmm::device_uvector chars(bytes, stream, mr); auto d_chars = chars.data(); - thrust::for_each_n(rmm::exec_policy(stream), + thrust::for_each_n(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), strings_count, [d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) { diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu index 041801336e6..c11664c86d4 100644 --- a/cpp/src/strings/replace/replace_slice.cu +++ b/cpp/src/strings/replace/replace_slice.cu @@ -50,7 +50,7 @@ struct replace_slice_fn { __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { - if (!d_chars) d_offsets[idx] = 0; + if (!d_chars) { d_offsets[idx] = 0; } return; } auto const d_str = d_strings.element(idx); @@ -75,34 +75,37 @@ struct replace_slice_fn { } // namespace -std::unique_ptr replace_slice(strings_column_view const& strings, +std::unique_ptr replace_slice(strings_column_view const& input, string_scalar const& repl, size_type start, size_type stop, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(type_id::STRING); + if (input.is_empty()) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); - if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); + if (stop > 0) { + CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); + } string_view d_repl(repl.data(), repl.size()); - auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_strings = column_device_view::create(input.parent(), stream); // this utility calls the given functor to build the offsets and chars columns auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); + replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr); - return make_strings_column(strings.size(), + return make_strings_column(input.size(), std::move(offsets_column), chars.release(), - strings.null_count(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr)); + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } + } // namespace detail -std::unique_ptr replace_slice(strings_column_view const& strings, +std::unique_ptr replace_slice(strings_column_view const& input, string_scalar const& repl, size_type start, size_type stop, @@ -110,7 +113,7 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, stream, mr); + return detail::replace_slice(input, repl, start, stop, stream, mr); } } // namespace strings From 5192b608eeed4bda9317c657253c3a5630aa4c5d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Apr 2024 09:11:37 -1000 Subject: [PATCH 006/842] Align date_range defaults with pandas, support tz (#15139) Precursor to https://github.com/rapidsai/cudf/issues/15116 * Aligns `date_range` signature with pandas, _technically_ an API breakage with `closed` changing defaults even though it still isn't supported * Copies pandas behavior of allowing `date_range` with just two of `start/end/periods` * Supports `tz` arg now Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15139 --- python/cudf/cudf/core/tools/datetimes.py | 49 +++++++++++++----------- python/cudf/cudf/tests/test_datetime.py | 16 ++++++++ 2 files changed, 43 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 65f97c99934..ed8fca88acd 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -799,9 +799,11 @@ def date_range( periods=None, freq=None, tz=None, - normalize=False, + normalize: bool = False, name=None, - closed=None, + closed: Literal["left", "right", "both", "neither"] = "both", + *, + unit: Optional[str] = None, ): """Return a fixed frequency DatetimeIndex. @@ -837,8 +839,13 @@ def date_range( name : str, default None Name of the resulting DatetimeIndex - closed : {None, 'left', 'right'}, optional - Not Supported + closed : {"left", "right", "both", "neither"}, default "both" + Whether to set each bound as closed or open. + Currently only "both" is supported + + unit : str, default None + Specify the desired resolution of the result. Currently + not supported. Returns ------- @@ -875,11 +882,15 @@ def date_range( '2026-04-23 08:00:00'], dtype='datetime64[ns]') """ - if tz is not None: - raise NotImplementedError("tz is currently unsupported.") + if closed != "both": + raise NotImplementedError(f"{closed=} is currently unsupported.") + if unit is not None: + raise NotImplementedError(f"{unit=} is currently unsupported.") + if normalize is not False: + raise NotImplementedError(f"{normalize=} is currently unsupported.") - if closed is not None: - raise NotImplementedError("closed is currently unsupported.") + if freq is None and any(arg is None for arg in (start, end, periods)): + freq = "D" if (start, end, periods, freq).count(None) > 1: raise ValueError( @@ -894,7 +905,7 @@ def date_range( FutureWarning, ) - dtype = np.dtype(" bool: @@ -1026,14 +1039,6 @@ def _has_non_fixed_frequency(freq: DateOffset) -> bool: return len(freq.kwds.keys() & non_fixed_frequencies) > 0 -def _has_mixed_freqeuency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains mixed fixed and non-fixed - frequency offset. e.g. {months=1, days=5} - """ - - return _has_fixed_frequency(freq) and _has_non_fixed_frequency(freq) - - def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: """Given a DateOffset, which can consist of either fixed frequency or non-fixed frequency offset, convert to the smallest possible fixed diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 7c209078fd2..37ba7acf044 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2357,3 +2357,19 @@ def test_timezone_array_notimplemented(): def test_to_datetime_errors_ignore_deprecated(): with pytest.warns(FutureWarning): cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") + + +def test_date_range_freq_default(): + result = pd.date_range("2020-01-01", periods=2, name="foo") + expected = cudf.date_range("2020-01-01", periods=2, name="foo") + assert_eq(result, expected) + + +def test_date_range_tz(): + result = pd.date_range("2020-01-01", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", periods=2, tz="UTC") + assert_eq(result, expected) + + result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") + assert_eq(result, expected) From fbaad8a480d3b2755afe04431c5abe6c098224b4 Mon Sep 17 00:00:00 2001 From: Tanmay Gujar Date: Wed, 3 Apr 2024 18:10:19 -0400 Subject: [PATCH 007/842] [FEA] Performance improvement for mixed left semi/anti join (#15288) Current implementation of mixed semi/anti join probes the built hash table twice -- once to find the output table size and once to build the output. Since the upper bound on output table size is O(N) where N is the size of the left table, we can avoid probing twice and achieve a faster join implementation. This implementation reserves the required upper memory bound, builds the output, and then collects the relevant output rows. This probes the hash table only once. This PR also removes the size kernels for mixed semi join and output size parameters passed to the mixed semi join. Closes #15250 # Benchmark Results from cudf repository ## mixed_left_semi_join_32bit (New implementation) ### [0] NVIDIA TITAN V ``` | Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples | CPU Time | Noise | GPU Time | Noise | |----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------| | I32 | I32 | 0 | 100000 | 100000 | 1920x | 266.239 us | 3.43% | 261.324 us | 2.84% | | I32 | I32 | 0 | 100000 | 400000 | 1024x | 495.434 us | 1.18% | 490.544 us | 0.63% | | I32 | I32 | 0 | 10000000 | 10000000 | 24x | 20.919 ms | 0.04% | 20.914 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 40000000 | 11x | 54.697 ms | 0.03% | 54.692 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 100000000 | 11x | 122.171 ms | 0.03% | 122.166 ms | 0.03% | | I32 | I32 | 0 | 80000000 | 100000000 | 11x | 192.979 ms | 0.01% | 192.975 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 100000000 | 11x | 212.878 ms | 0.01% | 212.874 ms | 0.01% | | I32 | I32 | 0 | 10000000 | 240000000 | 11x | 279.794 ms | 0.01% | 279.790 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 240000000 | 11x | 351.186 ms | 0.01% | 351.183 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 240000000 | 11x | 370.794 ms | 0.01% | 370.790 ms | 0.01% | ``` ## mixed_left_semi_join_32bit (Old implementation) ### [0] NVIDIA TITAN V ``` | Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples | CPU Time | Noise | GPU Time | Noise | |----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------| | I32 | I32 | 0 | 100000 | 100000 | 1392x | 368.030 us | 3.05% | 363.065 us | 2.70% | | I32 | I32 | 0 | 100000 | 400000 | 832x | 832.492 us | 0.84% | 827.586 us | 0.60% | | I32 | I32 | 0 | 10000000 | 10000000 | 16x | 32.310 ms | 0.03% | 32.305 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 40000000 | 11x | 100.222 ms | 0.03% | 100.218 ms | 0.03% | | I32 | I32 | 0 | 10000000 | 100000000 | 11x | 235.874 ms | 0.01% | 235.870 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 100000000 | 11x | 307.042 ms | 0.01% | 307.038 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 100000000 | 11x | 326.797 ms | 0.01% | 326.794 ms | 0.01% | | I32 | I32 | 0 | 10000000 | 240000000 | 11x | 552.730 ms | 0.01% | 552.728 ms | 0.01% | | I32 | I32 | 0 | 80000000 | 240000000 | 11x | 624.958 ms | 0.01% | 624.956 ms | 0.01% | | I32 | I32 | 0 | 100000000 | 240000000 | 11x | 644.148 ms | 0.00% | 644.146 ms | 0.00% | ``` Authors: - Tanmay Gujar (https://github.com/tgujar) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jason Lowe (https://github.com/jlowe) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15288 --- cpp/CMakeLists.txt | 1 - cpp/include/cudf/join.hpp | 90 +---- cpp/src/join/mixed_join_kernels_semi.cu | 31 +- cpp/src/join/mixed_join_kernels_semi.cuh | 64 +--- cpp/src/join/mixed_join_semi.cu | 360 ++---------------- cpp/src/join/mixed_join_size_kernels_semi.cu | 125 ------ cpp/tests/join/mixed_join_tests.cu | 41 -- java/src/main/java/ai/rapids/cudf/Table.java | 146 ------- java/src/main/native/src/TableJni.cpp | 60 --- .../test/java/ai/rapids/cudf/TableTest.java | 116 ------ 10 files changed, 42 insertions(+), 992 deletions(-) delete mode 100644 cpp/src/join/mixed_join_size_kernels_semi.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f1d43e3c35f..7c32474ea56 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -453,7 +453,6 @@ add_library( src/join/mixed_join_semi.cu src/join/mixed_join_size_kernel.cu src/join/mixed_join_size_kernel_nulls.cu - src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu src/json/json_path.cu src/lists/contains.cu diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index b7a3129cfec..e343ad9ee32 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -944,9 +944,6 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -958,8 +955,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, + null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -996,9 +992,6 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not - * @param output_size_data An optional pair of values indicating the exact output size and the - * number of matches for each row in the larger of the two input tables, left or right (may be - * precomputed using the corresponding mixed_full_join_size API). * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1010,8 +1003,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& left_conditional, table_view const& right_conditional, ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - std::optional>> output_size_data = {}, + null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1094,84 +1086,6 @@ std::pair>> mixed_le null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left semi join between the specified tables where the columns of the - * equality table are equal and the predicate evaluates to true on the - * conditional tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. - * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns the exact number of matches (rows) when performing a mixed - * left anti join between the specified tables. - * - * If the provided predicate returns NULL for a pair of rows (left, right), - * that pair is not included in the output. It is the user's responsibility to - * choose a suitable compare_nulls value AND use appropriate null-safe - * operators in the expression. - * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. - * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not - * match. - * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not - * match. - * - * @param left_equality The left table used for the equality join - * @param right_equality The right table used for the equality join - * @param left_conditional The left table used for the conditional join - * @param right_conditional The right table used for the conditional join - * @param binary_predicate The condition on which to join - * @param compare_nulls Whether or not null values join to each other or not - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return A pair containing the size that would result from performing the - * requested join and the number of matches for each row in one of the two - * tables. Which of the two tables is an implementation detail and should not - * be relied upon, simply passed to the corresponding `mixed_left_join` API as - * is. - */ -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns the exact number of matches (rows) when performing a * conditional inner join between the specified tables where the predicate diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index 5a543997a50..01e3fe09b38 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -41,12 +41,9 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables) + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data) { // Normally the casting of a shared memory array is used to create multiple // arrays of different types from the shared memory buffer, but here it is @@ -60,7 +57,7 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + auto const outer_num_rows = left_num_rows; cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size; @@ -70,12 +67,10 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ if (outer_row_index < outer_num_rows) { // Figure out the number of elements for this key. auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; + evaluator, thread_intermediate_storage, false, equality_probe}; - if ((join_type == join_kind::LEFT_ANTI_JOIN) != - (hash_table_view.contains(outer_row_index, hash_probe, equality))) { - *(join_output_l + join_result_offsets[outer_row_index]) = outer_row_index; - } + left_table_keep_mask[outer_row_index] = + hash_table_view.contains(outer_row_index, hash_probe, equality); } } @@ -86,12 +81,9 @@ template __global__ void mixed_join_semi( table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); template __global__ void mixed_join_semi( table_device_view left_table, @@ -100,12 +92,9 @@ template __global__ void mixed_join_semi( table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); } // namespace detail diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh index f411d36f0a8..4ea404d451c 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cuh +++ b/cpp/src/join/mixed_join_kernels_semi.cuh @@ -27,53 +27,7 @@ namespace cudf { namespace detail { /** - * @brief Computes the output size of joining the left table to the right table for semi/anti joins. - * - * This method probes the hash table with each row in the probe table using a - * custom equality comparator that also checks that the conditional expression - * evaluates to true between the left/right tables when a match is found - * between probe and build rows. - * - * @tparam block_size The number of threads per block for this kernel - * @tparam has_nulls Whether or not the inputs may contain nulls. - * - * @param[in] left_table The left table - * @param[in] right_table The right table - * @param[in] probe The table with which to probe the hash table for matches. - * @param[in] build The table with which the hash table was built. - * @param[in] hash_probe The hasher used for the probe table. - * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed - * @param[in] hash_table_view The hash table built from `build`. - * @param[in] device_expression_data Container of device data required to evaluate the desired - * expression. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. - * @param[out] output_size The resulting output size - * @param[out] matches_per_row The number of matches in one pair of - * equality/conditional tables for each row in the other pair of tables. If - * swap_tables is true, matches_per_row corresponds to the right_table, - * otherwise it corresponds to the left_table. Note that corresponding swap of - * left/right tables to determine which is the build table and which is the - * probe table has already happened on the host. - */ -template -__global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -/** - * @brief Performs a semi/anti join using the combination of a hash lookup to + * @brief Performs a semi join using the combination of a hash lookup to * identify equal rows between one pair of tables and the evaluation of an * expression containing an arbitrary expression. * @@ -91,16 +45,11 @@ __global__ void compute_mixed_join_output_size_semi( * @param[in] build The table with which the hash table was built. * @param[in] hash_probe The hasher used for the probe table. * @param[in] equality_probe The equality comparator used when probing the hash table. - * @param[in] join_type The type of join to be performed * @param[in] hash_table_view The hash table built from `build`. - * @param[out] join_output_l The left result of the join operation + * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating + * the corresponding index from left table is present in output * @param[in] device_expression_data Container of device data required to evaluate the desired * expression. - * @param[in] join_result_offsets The starting indices in join_output[l|r] - * where the matches for each row begin. Equivalent to a prefix sum of - * matches_per_row. - * @param[in] swap_tables If true, the kernel was launched with one thread per right row and - * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ template __global__ void mixed_join_semi(table_device_view left_table, @@ -109,12 +58,9 @@ __global__ void mixed_join_semi(table_device_view left_table, table_device_view build, row_hash const hash_probe, row_equality const equality_probe, - join_kind const join_type, cudf::detail::semi_map_type::device_view hash_table_view, - size_type* join_output_l, - cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const* join_result_offsets, - bool const swap_tables); + cudf::device_span left_table_keep_mask, + cudf::ast::detail::expression_device_view device_expression_data); } // namespace detail diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index edf6c32eadf..d654f580cad 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -92,7 +92,6 @@ std::unique_ptr> mixed_join_semi( ast::expression const& binary_predicate, null_equality compare_nulls, join_kind join_type, - std::optional>> output_size_data, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -107,12 +106,7 @@ std::unique_ptr> mixed_join_semi( auto const right_num_rows{right_conditional.num_rows()}; auto const left_num_rows{left_conditional.num_rows()}; - auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); - - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; + auto const outer_num_rows{left_num_rows}; // We can immediately filter out cases where the right table is empty. In // some cases, we return all the rows of the left table with a corresponding @@ -155,8 +149,8 @@ std::unique_ptr> mixed_join_semi( // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins // need to do the same). - auto& probe = swap_tables ? right_equality : left_equality; - auto& build = swap_tables ? left_equality : right_equality; + auto& probe = left_equality; + auto& build = right_equality; auto probe_view = table_device_view::create(probe, stream); auto build_view = table_device_view::create(build, stream); auto left_conditional_view = table_device_view::create(left_conditional, stream); @@ -197,8 +191,7 @@ std::unique_ptr> mixed_join_semi( auto const equality_build_equality = row_comparator_build.equal_to(build_nulls, compare_nulls); auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create( - swap_tables ? left_conditional : right_conditional, stream); + experimental::row::equality::preprocessed_table::create(right_conditional, stream); auto const row_comparator_conditional_build = cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, preprocessed_build_condtional}; @@ -225,84 +218,14 @@ std::unique_ptr> mixed_join_semi( auto hash_table_view = hash_table.get_device_view(); - // For inner joins we support optimizing the join by launching one thread for - // whichever table is larger rather than always using the left table. detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - join_kind const kernel_join_type = - join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; - - // If the join size data was not provided as an input, compute it here. - std::size_t join_size; - // Using an optional because we only need to allocate a new vector if one was - // not passed as input, and rmm::device_uvector is not default constructible - std::optional> matches_per_row{}; - device_span matches_per_row_span{}; auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - if (output_size_data.has_value()) { - join_size = output_size_data->first; - matches_per_row_span = output_size_data->second; - } else { - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - - matches_per_row = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - // Note that the view goes out of scope after this else statement, but the - // data owned by matches_per_row stays alive so the data pointer is valid. - auto mutable_matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - if (has_nulls) { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - mutable_matches_per_row_span); - } else { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - kernel_join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - mutable_matches_per_row_span); - } - join_size = size.value(stream); - } - - if (join_size == 0) { return std::make_unique>(0, stream, mr); } - - // Given the number of matches per row, we need to compute the offsets for insertion. - auto join_result_offsets = - rmm::device_uvector{static_cast(outer_num_rows), stream, mr}; - thrust::exclusive_scan(rmm::exec_policy{stream}, - matches_per_row_span.begin(), - matches_per_row_span.end(), - join_result_offsets.begin()); - - auto left_indices = std::make_unique>(join_size, stream, mr); - auto const& join_output_l = left_indices->data(); + // Vector used to indicate indices from left/probe table which are present in output + auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); if (has_nulls) { mixed_join_semi @@ -313,12 +236,9 @@ std::unique_ptr> mixed_join_semi( *build_view, hash_probe, equality_probe, - kernel_join_type, hash_table_view, - join_output_l, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + cudf::device_span(left_table_keep_mask), + parser.device_expression_data); } else { mixed_join_semi <<>>( @@ -328,235 +248,30 @@ std::unique_ptr> mixed_join_semi( *build_view, hash_probe, equality_probe, - kernel_join_type, hash_table_view, - join_output_l, - parser.device_expression_data, - join_result_offsets.data(), - swap_tables); + cudf::device_span(left_table_keep_mask), + parser.device_expression_data); } - return left_indices; -} - -std::pair>> -compute_mixed_join_output_size_semi(table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - join_kind join_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS( - (join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) && - (join_type != join_kind::FULL_JOIN), - "Inner, left, and full join size estimation should use compute_mixed_join_output_size."); - - CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(), - "The left conditional and equality tables must have the same number of rows."); - CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(), - "The right conditional and equality tables must have the same number of rows."); - - auto const right_num_rows{right_conditional.num_rows()}; - auto const left_num_rows{left_conditional.num_rows()}; - auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); - - // The "outer" table is the larger of the two tables. The kernels are - // launched with one thread per row of the outer table, which also means that - // it is the probe table for the hash - auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows}; - - auto matches_per_row = std::make_unique>( - static_cast(outer_num_rows), stream, mr); - auto matches_per_row_span = cudf::device_span{ - matches_per_row->begin(), static_cast(outer_num_rows)}; - - // We can immediately filter out cases where one table is empty. In - // some cases, we return all the rows of the other table with a corresponding - // null index for the empty table; in others, we return an empty output. - if (right_num_rows == 0) { - switch (join_type) { - // Left, left anti, and full all return all the row indices from left - // with a corresponding NULL from the right. - case join_kind::LEFT_ANTI_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1); - return {left_num_rows, std::move(matches_per_row)}; - } - // Inner and left semi joins return empty output because no matches can exist. - case join_kind::LEFT_SEMI_JOIN: return {0, std::move(matches_per_row)}; - default: CUDF_FAIL("Invalid join kind."); break; - } - } else if (left_num_rows == 0) { - switch (join_type) { - // Left, left anti, left semi, and inner joins all return empty sets. - case join_kind::LEFT_ANTI_JOIN: - case join_kind::LEFT_SEMI_JOIN: { - thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0); - return {0, std::move(matches_per_row)}; - } - default: CUDF_FAIL("Invalid join kind."); break; - } - } - - // If evaluating the expression may produce null outputs we create a nullable - // output column and follow the null-supporting expression evaluation code - // path. - auto const has_nulls = cudf::nullate::DYNAMIC{ - cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) || - binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)}; - - auto const parser = ast::detail::expression_parser{ - binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; - CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); - - // TODO: The non-conditional join impls start with a dictionary matching, - // figure out what that is and what it's needed for (and if conditional joins - // need to do the same). - auto& probe = swap_tables ? right_equality : left_equality; - auto& build = swap_tables ? left_equality : right_equality; - auto probe_view = table_device_view::create(probe, stream); - auto build_view = table_device_view::create(build, stream); - auto left_conditional_view = table_device_view::create(left_conditional, stream); - auto right_conditional_view = table_device_view::create(right_conditional, stream); - - auto const preprocessed_build = - experimental::row::equality::preprocessed_table::create(build, stream); - auto const preprocessed_probe = - experimental::row::equality::preprocessed_table::create(probe, stream); - auto const row_comparator = - cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; - auto const equality_probe = row_comparator.equal_to(has_nulls, compare_nulls); - - semi_map_type hash_table{compute_hash_table_size(build.num_rows()), - cuco::empty_key{std::numeric_limits::max()}, - cuco::empty_value{cudf::detail::JoinNoneValue}, - cudf::detail::cuco_allocator{stream}, - stream.value()}; - - // Create hash table containing all keys found in right table - // TODO: To add support for nested columns we will need to flatten in many - // places. However, this probably isn't worth adding any time soon since we - // won't be able to support AST conditions for those types anyway. - auto const build_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(build)}; - auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build}; - auto const hash_build = row_hash_build.device_hasher(build_nulls); - // Since we may see multiple rows that are identical in the equality tables - // but differ in the conditional tables, the equality comparator used for - // insertion must account for both sets of tables. An alternative solution - // would be to use a multimap, but that solution would store duplicates where - // equality and conditional rows are equal, so this approach is preferable. - // One way to make this solution even more efficient would be to only include - // the columns of the conditional table that are used by the expression, but - // that requires additional plumbing through the AST machinery and is out of - // scope for now. - auto const row_comparator_build = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_build}; - auto const equality_build_equality = - row_comparator_build.equal_to(build_nulls, compare_nulls); - auto const preprocessed_build_condtional = - experimental::row::equality::preprocessed_table::create( - swap_tables ? left_conditional : right_conditional, stream); - auto const row_comparator_conditional_build = - cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional, - preprocessed_build_condtional}; - auto const equality_build_conditional = - row_comparator_conditional_build.equal_to(build_nulls, compare_nulls); - double_row_equality equality_build{equality_build_equality, equality_build_conditional}; - make_pair_function_semi pair_func_build{}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); - - // skip rows that are null here. - if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { - hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value()); - } else { - thrust::counting_iterator stencil(0); - auto const [row_bitmask, _] = - cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()); - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows - hash_table.insert_if( - iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value()); - } - - auto hash_table_view = hash_table.get_device_view(); - - // For inner joins we support optimizing the join by launching one thread for - // whichever table is larger rather than always using the left table. - detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; - - // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); - - auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; - auto const hash_probe = row_hash.device_hasher(has_nulls); - - // Determine number of output rows without actually building the output to simply - // find what the size of the output will be. - if (has_nulls) { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); - } else { - compute_mixed_join_output_size_semi - <<>>( - *left_conditional_view, - *right_conditional_view, - *probe_view, - *build_view, - hash_probe, - equality_probe, - join_type, - hash_table_view, - parser.device_expression_data, - swap_tables, - size.data(), - matches_per_row_span); - } - - return {size.value(stream), std::move(matches_per_row)}; + auto gather_map = std::make_unique>(probe.num_rows(), stream, mr); + + // gather_map_end will be the end of valid data in gather_map + auto gather_map_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(probe.num_rows()), + left_table_keep_mask.begin(), + gather_map->begin(), + [join_type] __device__(bool keep_row) { + return keep_row == (join_type == detail::join_kind::LEFT_SEMI_JOIN); + }); + + gather_map->resize(thrust::distance(gather_map->begin(), gather_map_end), stream); + return gather_map; } } // namespace detail -std::pair>> mixed_left_semi_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::compute_mixed_join_output_size_semi(left_equality, - right_equality, - left_conditional, - right_conditional, - binary_predicate, - compare_nulls, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); -} - std::unique_ptr> mixed_left_semi_join( table_view const& left_equality, table_view const& right_equality, @@ -564,7 +279,6 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, - std::optional>> output_size_data, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -575,32 +289,10 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - output_size_data, cudf::get_default_stream(), mr); } -std::pair>> mixed_left_anti_join_size( - table_view const& left_equality, - table_view const& right_equality, - table_view const& left_conditional, - table_view const& right_conditional, - ast::expression const& binary_predicate, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::compute_mixed_join_output_size_semi(left_equality, - right_equality, - left_conditional, - right_conditional, - binary_predicate, - compare_nulls, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); -} - std::unique_ptr> mixed_left_anti_join( table_view const& left_equality, table_view const& right_equality, @@ -608,7 +300,6 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, - std::optional>> output_size_data, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -619,7 +310,6 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - output_size_data, cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu deleted file mode 100644 index 7a22ac60710..00000000000 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "join/join_common_utils.cuh" -#include "join/join_common_utils.hpp" -#include "join/mixed_join_common_utils.cuh" - -#include -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { - -namespace cg = cooperative_groups; - -#pragma GCC diagnostic ignored "-Wattributes" - -template -__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__ - void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row) -{ - // The (required) extern storage of the shared memory array leads to - // conflicting declarations between different templates. The easiest - // workaround is to declare an arbitrary (here char) array type then cast it - // after the fact to the appropriate type. - extern __shared__ char raw_intermediate_storage[]; - cudf::ast::detail::IntermediateDataType* intermediate_storage = - reinterpret_cast*>(raw_intermediate_storage); - auto thread_intermediate_storage = - intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates); - - std::size_t thread_counter{0}; - cudf::size_type const start_idx = threadIdx.x + blockIdx.x * block_size; - cudf::size_type const stride = block_size * gridDim.x; - cudf::size_type const left_num_rows = left_table.num_rows(); - cudf::size_type const right_num_rows = right_table.num_rows(); - auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); - - auto evaluator = cudf::ast::detail::expression_evaluator( - left_table, right_table, device_expression_data); - - // TODO: Address asymmetry in operator. - auto equality = single_expression_equality{ - evaluator, thread_intermediate_storage, swap_tables, equality_probe}; - - for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; - outer_row_index += stride) { - matches_per_row[outer_row_index] = - ((join_type == join_kind::LEFT_ANTI_JOIN) != - (hash_table_view.contains(outer_row_index, hash_probe, equality))); - thread_counter += matches_per_row[outer_row_index]; - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - - // Add block counter to global counter - if (threadIdx.x == 0) { - cuda::atomic_ref ref{*output_size}; - ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); - } -} - -template __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -template __global__ void compute_mixed_join_output_size_semi( - table_device_view left_table, - table_device_view right_table, - table_device_view probe, - table_device_view build, - row_hash const hash_probe, - row_equality const equality_probe, - join_kind const join_type, - cudf::detail::semi_map_type::device_view hash_table_view, - ast::detail::expression_device_view device_expression_data, - bool const swap_tables, - std::size_t* output_size, - cudf::device_span matches_per_row); - -} // namespace detail - -} // namespace cudf diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index cc37dadffd8..6c147c8a128 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -657,10 +657,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest { std::vector expected_outputs, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) { - auto [result_size, actual_counts] = this->join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - EXPECT_TRUE(result_size == expected_outputs.size()); - auto result = this->join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); std::vector resulting_indices; @@ -751,19 +747,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest { cudf::table_view right_conditional, cudf::ast::operation predicate, cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; - - /** - * This method must be implemented by subclasses for specific types of joins. - * It should be a simply forwarding of arguments to the appropriate cudf - * mixed join size computation API. - */ - virtual std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0; }; /** @@ -781,18 +764,6 @@ struct MixedLeftSemiJoinTest : public MixedJoinSingleReturnTest { return cudf::mixed_left_semi_join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override - { - return cudf::mixed_left_semi_join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - } }; TYPED_TEST_SUITE(MixedLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); @@ -874,18 +845,6 @@ struct MixedLeftAntiJoinTest : public MixedJoinSingleReturnTest { return cudf::mixed_left_anti_join( left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); } - - std::pair>> join_size( - cudf::table_view left_equality, - cudf::table_view right_equality, - cudf::table_view left_conditional, - cudf::table_view right_conditional, - cudf::ast::operation predicate, - cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override - { - return cudf::mixed_left_anti_join_size( - left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls); - } }; TYPED_TEST_SUITE(MixedLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 5ce2f9d2d6e..4038b3a40b8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -732,32 +732,14 @@ private static native long[] mixedFullJoinGatherMaps(long leftKeysTable, long ri long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinGatherMap(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftSemiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); - - private static native long[] mixedLeftAntiJoinSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual); - private static native long[] mixedLeftAntiJoinGatherMap(long leftKeysTable, long rightKeysTable, long leftConditionTable, long rightConditionTable, long condition, boolean compareNullsEqual); - private static native long[] mixedLeftAntiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable, - long leftConditionTable, long rightConditionTable, - long condition, boolean compareNullsEqual, - long outputRowCount, long matchesColumnView); - private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException; private static native long[] concatenate(long[] cudfTablePointers) throws CudfException; @@ -3747,34 +3729,6 @@ public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable, return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes output size information for a left semi join between two tables using a mix of - * equality and inequality conditions. The entire join condition is assumed to be a logical AND - * of the equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @return size information for the join - */ - public static MixedJoinSize mixedLeftSemiJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftSemiJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); - } - /** * Computes the gather map that can be used to manifest the result of a left semi join between * two tables using a mix of equality and inequality conditions. The entire join condition is @@ -3804,42 +3758,6 @@ public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKe return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes the gather map that can be used to manifest the result of a left semi join between - * two tables using a mix of equality and inequality conditions. The entire join condition is - * assumed to be a logical AND of the equality condition and inequality condition. - * A {@link GatherMap} instance will be returned that can be used to gather - * the left table to produce the result of the left semi join. - * - * It is the responsibility of the caller to close the resulting gather map instances. - * - * This interface allows passing the size result from - * {@link #mixedLeftSemiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} - * when the output size was computed previously. - * - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result - * @return left and right table gather maps - */ - public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftSemiJoinGatherMapWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); - return buildSingleJoinGatherMap(gatherMapData); - } - /** * Computes the gather map that can be used to manifest the result of a left anti-join between * two tables. It is assumed this table instance holds the key columns from the left table, and @@ -3919,34 +3837,6 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable, return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes output size information for a left anti join between two tables using a mix of - * equality and inequality conditions. The entire join condition is assumed to be a logical AND - * of the equality condition and inequality condition. - * NOTE: It is the responsibility of the caller to close the resulting size information object - * or native resources can be leaked! - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @return size information for the join - */ - public static MixedJoinSize mixedLeftAntiJoinSize(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality) { - long[] mixedSizeInfo = mixedLeftAntiJoinSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), nullEquality == NullEquality.EQUAL); - assert mixedSizeInfo.length == 2; - long outputRowCount = mixedSizeInfo[0]; - long matchesColumnHandle = mixedSizeInfo[1]; - return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle)); - } - /** * Computes the gather map that can be used to manifest the result of a left anti join between * two tables using a mix of equality and inequality conditions. The entire join condition is @@ -3976,42 +3866,6 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe return buildSingleJoinGatherMap(gatherMapData); } - /** - * Computes the gather map that can be used to manifest the result of a left anti join between - * two tables using a mix of equality and inequality conditions. The entire join condition is - * assumed to be a logical AND of the equality condition and inequality condition. - * A {@link GatherMap} instance will be returned that can be used to gather - * the left table to produce the result of the left anti join. - * - * It is the responsibility of the caller to close the resulting gather map instances. - * - * This interface allows passing the size result from - * {@link #mixedLeftAntiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)} - * when the output size was computed previously. - * - * @param leftKeys the left table's key columns for the equality condition - * @param rightKeys the right table's key columns for the equality condition - * @param leftConditional the left table's columns needed to evaluate the inequality condition - * @param rightConditional the right table's columns needed to evaluate the inequality condition - * @param condition the inequality condition of the join - * @param nullEquality whether nulls should compare as equal - * @param joinSize mixed join size result - * @return left and right table gather maps - */ - public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKeys, - Table leftConditional, Table rightConditional, - CompiledExpression condition, - NullEquality nullEquality, - MixedJoinSize joinSize) { - long[] gatherMapData = mixedLeftAntiJoinGatherMapWithSize( - leftKeys.getNativeView(), rightKeys.getNativeView(), - leftConditional.getNativeView(), rightConditional.getNativeView(), - condition.getNativeHandle(), - nullEquality == NullEquality.EQUAL, - joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView()); - return buildSingleJoinGatherMap(gatherMapData); - } - /** * Construct a table from a packed representation. * @param metadata host-based metadata for the table diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 51b8eb853de..e8616710217 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2838,20 +2838,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { - return cudf::jni::mixed_join_size( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_semi_join_size(left_keys, right_keys, left_condition, - right_condition, condition, nulls_equal); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { @@ -2866,22 +2852,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMapWithSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, - jlong j_matches_view) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); - return cudf::jni::mixed_join_gather_single_map( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition, - condition, nulls_equal, size_info); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_single_map( @@ -2930,20 +2900,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { - return cudf::jni::mixed_join_size( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_anti_join_size(left_keys, right_keys, left_condition, - right_condition, condition, nulls_equal); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) { @@ -2958,22 +2914,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMa }); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMapWithSize( - JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition, - jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count, - jlong j_matches_view) { - auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view); - return cudf::jni::mixed_join_gather_single_map( - env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition, - j_nulls_equal, - [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys, - cudf::table_view const &left_condition, cudf::table_view const &right_condition, - cudf::ast::expression const &condition, cudf::null_equality nulls_equal) { - return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition, - condition, nulls_equal, size_info); - }); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass, jlong left_table, jlong right_table) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 30905783c7f..8560a9caad7 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -3058,64 +3058,6 @@ void testMixedLeftSemiJoinGatherMapNulls() { } } - @Test - void testMixedLeftSemiJoinGatherMapWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(6, 5, 9, 8, 10, 32) - .column(0, 1, 2, 3, 4, 5) - .column(7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(2, 7, 8) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - - @Test - void testMixedLeftSemiJoinGatherMapNullsWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) - .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 0, 1, 2, 3, 4, 5) - .column( 7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(0, 7, 8) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - @Test void testMixedLeftAntiJoinGatherMap() { BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, @@ -3166,64 +3108,6 @@ void testMixedLeftAntiJoinGatherMapNulls() { } } - @Test - void testMixedLeftAntiJoinGatherMapWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(6, 5, 9, 8, 10, 32) - .column(0, 1, 2, 3, 4, 5) - .column(7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(0, 1, 3, 4, 5, 6, 9) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.UNEQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - - @Test - void testMixedLeftAntiJoinGatherMapNullsWithSize() { - BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER, - new ColumnReference(1, TableReference.LEFT), - new ColumnReference(1, TableReference.RIGHT)); - try (CompiledExpression condition = expr.compile(); - Table left = new Table.TestBuilder() - .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8) - .column( 1, 2, 3, 4, 5, 6, 7, 8, 9, 0) - .build(); - Table leftKeys = new Table(left.getColumn(0)); - Table right = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 0, 1, 2, 3, 4, 5) - .column( 7, 8, 9, 0, 1, 2).build(); - Table rightKeys = new Table(right.getColumn(0)); - Table expected = new Table.TestBuilder() - .column(1, 2, 3, 4, 5, 6, 9) - .build(); - MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL)) { - assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount()); - try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right, - condition, NullEquality.EQUAL, sizeInfo)) { - verifySemiJoinGatherMap(map, expected); - } - } - } - @Test void testLeftSemiJoinGatherMap() { try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build(); From 61dbfe8dc7635264465ce46d7de9e87ca0353267 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 4 Apr 2024 15:22:48 -0400 Subject: [PATCH 008/842] Allow jit compilation when using a splayed CUDA toolkit (#15451) The `JitifyPreprocessKernels.cmake` module now handles when `CUDAToolkit_INCLUDE_DIRS` has multiple values correctly, allowing for compilation with splayed CUDA Toolkit installs. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15451 --- cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 8c4e2b47fca..752c2028350 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -23,8 +23,9 @@ target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS}) function(jit_preprocess_files) cmake_parse_arguments(ARG "" "SOURCE_DIRECTORY" "FILES" ${ARGN}) - foreach(inc IN LISTS libcudacxx_raw_includes) - list(APPEND libcudacxx_includes "-I${inc}") + set(includes) + foreach(inc IN LISTS libcudacxx_raw_includes CUDAToolkit_INCLUDE_DIRS) + list(APPEND includes "-I${inc}") endforeach() foreach(ARG_FILE ${ARG_FILES}) set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp) @@ -44,8 +45,7 @@ function(jit_preprocess_files) $ ${ARG_FILE} -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include - -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS} - --no-preinclude-workarounds --no-replace-pragma-once + -I${CUDF_SOURCE_DIR}/src ${includes} --no-preinclude-workarounds --no-replace-pragma-once COMMENT "Custom command to JIT-compile files." ) endforeach() From c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Thu, 4 Apr 2024 15:24:04 -0400 Subject: [PATCH 009/842] Allow consumers of static builds to find nanoarrow (#15456) Allows consumers like spark-rapids to bring in libcudf static builds from the install and build trees. Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15456 --- cpp/cmake/thirdparty/get_nanoarrow.cmake | 1 + .../thirdparty/patches/nanoarrow_cmake.diff | 39 +++++++++++++++---- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 4316db99a8d..884e5a2f368 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -49,6 +49,7 @@ function(find_and_configure_nanoarrow) OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports) endfunction() find_and_configure_nanoarrow( diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff index b53e134ed2c..1262a38c0a4 100644 --- a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff +++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 8714c70..1feec13 100644 +index 8714c70..6a9e505 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,6 @@ else() @@ -10,7 +10,15 @@ index 8714c70..1feec13 100644 # Avoids a warning about timestamps on downloaded files (prefer new policy # if available)) -@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE) +@@ -59,6 +58,7 @@ endif() + + configure_file(src/nanoarrow/nanoarrow_config.h.in generated/nanoarrow_config.h) + ++include(GNUInstallDirs) + if(NANOARROW_BUNDLE) + # Combine all headers into amalgamation/nanoarrow.h in the build directory + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation) +@@ -111,6 +111,8 @@ if(NANOARROW_BUNDLE) if(NANOARROW_BUILD_TESTS) include_directories(${CMAKE_BINARY_DIR}/amalgamation) add_library(nanoarrow ${NANOARROW_C_TEMP}) @@ -19,7 +27,7 @@ index 8714c70..1feec13 100644 target_compile_definitions(nanoarrow PUBLIC "$<$:NANOARROW_DEBUG>") endif() -@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE) +@@ -120,10 +122,11 @@ if(NANOARROW_BUNDLE) else() add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c src/nanoarrow/array_stream.c src/nanoarrow/utils.c) @@ -27,25 +35,31 @@ index 8714c70..1feec13 100644 target_include_directories(nanoarrow PUBLIC $ -@@ -154,13 +156,50 @@ else() +- $) ++ $) + target_include_directories(nanoarrow + PUBLIC $ + ) +@@ -154,13 +157,49 @@ else() endif() endif() - install(TARGETS nanoarrow DESTINATION lib) + install(TARGETS nanoarrow -+ DESTINATION lib ++ DESTINATION "${CMAKE_INSTALL_LIBDIR}" + EXPORT nanoarrow-exports) install(DIRECTORY src/ - DESTINATION include +- DESTINATION include ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" FILES_MATCHING - PATTERN "*.h") + PATTERN "*.h*") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h - DESTINATION include/nanoarrow) +- DESTINATION include/nanoarrow) ++ DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nanoarrow") + + # Generate package files for the build and install trees. + include(CMakePackageConfigHelpers) -+ include(GNUInstallDirs) + + foreach(tree_type BUILD INSTALL) + if(tree_type STREQUAL "BUILD") @@ -80,6 +94,15 @@ index 8714c70..1feec13 100644 endif() # Always build integration test if building tests +@@ -171,7 +210,7 @@ if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS) + src/nanoarrow/integration/c_data_integration.cc) + target_include_directories(nanoarrow_c_data_integration + PUBLIC $ +- $) ++ $) + target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json) + endif() + @@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS) src/nanoarrow/integration/c_data_integration_test.cc) From 8509054861f57379524982cc70db294d85a0dc5c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:09:45 -0400 Subject: [PATCH 010/842] Remove deprecated hash() and spark_murmurhash3_x86_32() (#15375) Remove deprecated libcudf hash functions. The `cudf::hash()` and `cudf::hashing::spark_murmurhash3_x86_32()` were deprecated in previous releases. The `cudf::hash_partition()` function still relies on the enum `hash_id` so it has been moved from `hashing.cpp` to `partitioning.hpp`. Calls to `cudf::hashing::spark_murmurhash3_x86_32()` were also removed from the JNI code. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec - Jason Lowe (https://github.com/jlowe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/15375 --- cpp/CMakeLists.txt | 2 - cpp/include/cudf/hashing.hpp | 52 -- cpp/include/cudf/hashing/detail/hashing.hpp | 5 - cpp/include/cudf/partitioning.hpp | 10 +- cpp/src/hash/hashing.cu | 53 -- cpp/src/hash/spark_murmurhash3_x86_32.cu | 442 -------------- .../hashing/spark_murmurhash3_x86_32_test.cpp | 576 ------------------ .../partitioning/hash_partition_test.cpp | 15 - .../java/ai/rapids/cudf/ColumnVector.java | 44 +- .../main/java/ai/rapids/cudf/HashType.java | 6 +- java/src/main/native/src/ColumnVectorJni.cpp | 10 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 219 ------- 12 files changed, 18 insertions(+), 1416 deletions(-) delete mode 100644 cpp/src/hash/hashing.cu delete mode 100644 cpp/src/hash/spark_murmurhash3_x86_32.cu delete mode 100644 cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7c32474ea56..7d62e0acb10 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,7 +346,6 @@ add_library( src/groupby/sort/group_replace_nulls.cu src/groupby/sort/group_sum_scan.cu src/groupby/sort/sort_helper.cu - src/hash/hashing.cu src/hash/md5_hash.cu src/hash/murmurhash3_x86_32.cu src/hash/murmurhash3_x64_128.cu @@ -355,7 +354,6 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu - src/hash/spark_murmurhash3_x86_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/from_arrow.cu diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 64a78da1803..83962b50a10 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -34,42 +34,11 @@ namespace cudf { */ using hash_value_type = uint32_t; -/** - * @brief Identifies the hash function to be used - * - */ -enum class hash_id { - HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed - HASH_MURMUR3, ///< Murmur3 hash function - HASH_SPARK_MURMUR3, ///< Spark Murmur3 hash function - HASH_MD5 ///< MD5 hash function -}; - /** * @brief The default seed value for hash functions */ static constexpr uint32_t DEFAULT_HASH_SEED = 0; -/** - * @brief Computes the hash value of each row in the input set of columns. - * - * @deprecated Since 23.08 - * - * @param input The table of columns to hash - * @param hash_function The hash function enum to use - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a column from the input - */ -[[deprecated]] std::unique_ptr hash( - table_view const& input, - hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - //! Hash APIs namespace hashing { @@ -112,27 +81,6 @@ std::unique_ptr murmurhash3_x64_128( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table - * - * @deprecated Since 24.04 - * - * This function computes the hash similar to MurmurHash3_x86_32 with special processing - * to match Spark's implementation results. - * - * @param input The table of columns to hash - * @param seed Optional seed value to use for the hash function - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * - * @returns A column where each row is the hash of a row from the input - */ -[[deprecated]] std::unique_ptr spark_murmurhash3_x86_32( - table_view const& input, - uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Computes the MD5 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index eaeb5d6b068..88a43a64638 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -37,11 +37,6 @@ std::unique_ptr
murmurhash3_x64_128(table_view const& input, rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr); -std::unique_ptr spark_murmurhash3_x86_32(table_view const& input, - uint32_t seed, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource* mr); - std::unique_ptr md5(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 2c91bdf64f5..7033aa500a2 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,14 @@ namespace cudf { * @brief Column partitioning APIs */ +/** + * @brief Identifies the hash function to be used in hash partitioning + */ +enum class hash_id { + HASH_IDENTITY = 0, ///< Identity hash function that simply returns the key to be hashed + HASH_MURMUR3 ///< Murmur3 hash function +}; + /** * @brief Partitions rows of `t` according to the mapping specified by * `partition_map`. diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu deleted file mode 100644 index 68e02ef3cf4..00000000000 --- a/cpp/src/hash/hashing.cu +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include - -#include - -namespace cudf { -namespace hashing { -namespace detail { - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - switch (hash_function) { - case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr); - case (hash_id::HASH_MD5): return md5(input, stream, mr); - default: CUDF_FAIL("Unsupported hash function."); - } -} - -} // namespace detail -} // namespace hashing - -std::unique_ptr hash(table_view const& input, - hash_id hash_function, - uint32_t seed, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return hashing::detail::hash(input, hash_function, seed, stream, mr); -} - -} // namespace cudf diff --git a/cpp/src/hash/spark_murmurhash3_x86_32.cu b/cpp/src/hash/spark_murmurhash3_x86_32.cu deleted file mode 100644 index c7992b4afa0..00000000000 --- a/cpp/src/hash/spark_murmurhash3_x86_32.cu +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace hashing { -namespace detail { - -namespace { - -using spark_hash_value_type = int32_t; - -template ())> -struct Spark_MurmurHash3_x86_32 { - using result_type = spark_hash_value_type; - - constexpr Spark_MurmurHash3_x86_32() = default; - constexpr Spark_MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {} - - [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const - { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - - [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data, - cudf::size_type offset) const - { - // Read a 4-byte value from the data pointer as individual bytes for safe - // unaligned access (very likely for string types). - auto block = reinterpret_cast(data + offset); - return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24); - } - - [[nodiscard]] result_type __device__ inline operator()(Key const& key) const - { - return compute(key); - } - - template - result_type __device__ inline compute(T const& key) const - { - return compute_bytes(reinterpret_cast(&key), sizeof(T)); - } - - result_type __device__ inline compute_remaining_bytes(std::byte const* data, - cudf::size_type len, - cudf::size_type tail_offset, - result_type h) const - { - // Process remaining bytes that do not fill a four-byte chunk using Spark's approach - // (does not conform to normal MurmurHash3). - for (auto i = tail_offset; i < len; i++) { - // We require a two-step cast to get the k1 value from the byte. First, - // we must cast to a signed int8_t. Then, the sign bit is preserved when - // casting to uint32_t under 2's complement. Java preserves the sign when - // casting byte-to-int, but C++ does not. - uint32_t k1 = static_cast(std::to_integer(data[i])); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - return h; - } - - result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const - { - constexpr cudf::size_type BLOCK_SIZE = 4; - cudf::size_type const nblocks = len / BLOCK_SIZE; - cudf::size_type const tail_offset = nblocks * BLOCK_SIZE; - result_type h = m_seed; - - // Process all four-byte chunks. - for (cudf::size_type i = 0; i < nblocks; i++) { - uint32_t k1 = getblock32(data, i * BLOCK_SIZE); - k1 *= c1; - k1 = rotate_bits_left(k1, rot_c1); - k1 *= c2; - h ^= k1; - h = rotate_bits_left(static_cast(h), rot_c2); - h = h * 5 + c3; - } - - h = compute_remaining_bytes(data, len, tail_offset, h); - - // Finalize hash. - h ^= len; - h = fmix32(h); - return h; - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; - static constexpr uint32_t c1 = 0xcc9e2d51; - static constexpr uint32_t c2 = 0x1b873593; - static constexpr uint32_t c3 = 0xe6546b64; - static constexpr uint32_t rot_c1 = 15; - static constexpr uint32_t rot_c2 = 13; -}; - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - bool const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint8_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - int16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - uint16_t const& key) const -{ - return compute(key); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - float const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - double const& key) const -{ - return compute(normalize_nans(key)); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - cudf::string_view const& key) const -{ - auto const data = reinterpret_cast(key.data()); - auto const len = key.size_bytes(); - return compute_bytes(data, len); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal32 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal64 const& key) const -{ - return compute(key.value()); -} - -template <> -spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32::operator()( - numeric::decimal128 const& key) const -{ - // Generates the Spark MurmurHash3 hash value, mimicking the conversion: - // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray() - // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381 - __int128_t const val = key.value(); - constexpr cudf::size_type key_size = sizeof(__int128_t); - std::byte const* data = reinterpret_cast(&val); - - // Small negative values start with 0xff..., small positive values start with 0x00... - bool const is_negative = val < 0; - std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00}; - - // If the value can be represented with a shorter than 16-byte integer, the - // leading bytes of the little-endian value are truncated and are not hashed. - auto const reverse_begin = thrust::reverse_iterator(data + key_size); - auto const reverse_end = thrust::reverse_iterator(data); - auto const first_nonzero_byte = - thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) { - return v == zero_value; - }).base(); - // Max handles special case of 0 and -1 which would shorten to 0 length otherwise - cudf::size_type length = - std::max(1, static_cast(thrust::distance(data, first_nonzero_byte))); - - // Preserve the 2's complement sign bit by adding a byte back on if necessary. - // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to - // preserve the sign bit, rather than leaving an "f" at the front which would - // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte - // is needed because the leftmost bit matches the sign bit. Similarly for - // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80. - if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; } - - // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed. - __int128_t big_endian_value = 0; - auto big_endian_data = reinterpret_cast(&big_endian_value); - thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data); - return compute_bytes(big_endian_data, length); -} - -/** - * @brief Computes the hash value of a row in the given table. - * - * This functor uses Spark conventions for Murmur hashing, which differs from - * the Murmur implementation used in the rest of libcudf. These differences - * include: - * - Serially using the output hash as an input seed for the next item - * - Ignorance of null values - * - * The serial use of hashes as seeds means that data of different nested types - * can exhibit hash collisions. For example, a row of an integer column - * containing a 1 will have the same hash as a lists column of integers - * containing a list of [1] and a struct column of a single integer column - * containing a struct of {1}. - * - * As a consequence of ignoring null values, inputs like [1], [1, null], and - * [null, 1] have the same hash (an expected hash collision). This kind of - * collision can also occur across a table of nullable columns and with nulls - * in structs ({1, null} and {null, 1} have the same hash). The seed value (the - * previous element's hash value) is returned as the hash if an element is - * null. - * - * For additional differences such as special tail processing and decimal type - * handling, refer to the Spark_MurmurHash3_x86_32 functor. - * - * @tparam hash_function Hash functor to use for hashing elements. Must be Spark_MurmurHash3_x86_32. - * @tparam Nullate A cudf::nullate type describing whether to check for nulls. - */ -template