diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index adc5bdb2af8..f16214260f7 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf { * @param unary_udf The PTX/CUDA string of the unary function to apply * @param output_type The output type that is compatible with the output type in the UDF * @param is_ptx true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return The column resulting from applying the unary function to * every element of the input @@ -56,6 +57,7 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -65,12 +67,14 @@ std::unique_ptr transform( * @throws cudf::logic_error if `input.type()` is a non-floating type * * @param input An immutable view of the input column of floating-point type + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned bitmask * @return A pair containing a `device_buffer` with the new bitmask and it's * null count obtained by replacing `NaN` in `input` with null. */ std::pair, size_type> nans_to_nulls( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -83,12 +87,14 @@ std::pair, size_type> nans_to_nulls( * * @param table The table used for expression evaluation * @param expr The root of the expression tree + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource * @return Output column */ std::unique_ptr compute_column( table_view const& table, ast::expression const& expr, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -101,6 +107,7 @@ std::unique_ptr compute_column( * @throws cudf::logic_error if `input.type()` is a non-boolean type * * @param input Boolean elements to convert to a bitmask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned bitmask * @return A pair containing a `device_buffer` with the new bitmask and it's * null count obtained from input considering `true` represent `valid`/`1` and @@ -108,6 +115,7 @@ std::unique_ptr compute_column( */ std::pair, cudf::size_type> bools_to_mask( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -130,12 +138,14 @@ std::pair, cudf::size_type> bools_to_mask( * @endcode * * @param input Table containing values to be encoded + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A pair containing the distinct row of the input table in sorter order, * and a column of integer indices representing the encoded rows. */ std::pair, std::unique_ptr> encode( cudf::table_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -162,12 +172,14 @@ std::pair, std::unique_ptr> encode( * * @param input Column containing values to be encoded * @param categories Column containing categories + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return A pair containing the owner to all encoded data and a table view into the data */ std::pair, table_view> one_hot_encode( column_view const& input, column_view const& categories, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -188,6 +200,7 @@ std::pair, table_view> one_hot_encode( * @param bitmask A device pointer to the bitmask which needs to be converted * @param begin_bit position of the bit from which the conversion should start * @param end_bit position of the bit before which the conversion should stop + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A boolean column representing the given mask from [begin_bit, end_bit) */ @@ -195,6 +208,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @@ -219,11 +233,14 @@ std::unique_ptr mask_to_bools( * row_bit_count(column(x)) >= row_bit_count(gather(column(x))) * * @param t The table view to perform the computation on + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the per-row bit counts */ std::unique_ptr row_bit_count( - table_view const& t, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + table_view const& t, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for @@ -240,12 +257,14 @@ std::unique_ptr row_bit_count( * * @param t The table view to perform the computation on * @param segment_length The number of rows in each segment for which the total size is computed + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned columns' device memory * @return A 32-bit integer column containing the bit counts for each segment of rows */ std::unique_ptr segmented_row_bit_count( table_view const& t, size_type segment_length, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3d41f856f4f..a867d4adfa1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -247,7 +247,7 @@ std::shared_ptr dispatch_to_arrow::operator()(column_view in arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); + auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource()); auto data_buffer = allocate_arrow_buffer(static_cast(bitmask.first->size()), ar_mr); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index cea7cdebcba..a5f3f9d87f5 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -200,7 +200,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& column, nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); - auto bitmask = bools_to_mask(column.view(), stream, mr); + auto bitmask = detail::bools_to_mask(column.view(), stream, mr); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get())); NANOARROW_RETURN_NOT_OK( @@ -442,7 +442,7 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); - auto bitmask = bools_to_mask(column, stream, mr); + auto bitmask = detail::bools_to_mask(column, stream, mr); NANOARROW_RETURN_NOT_OK( set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get())); NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get())); diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 193b3a3b5a2..26f7c7e6e53 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -147,7 +147,7 @@ int dispatch_to_arrow_host::operator()(ArrowArray* out) const NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); - auto bitmask = bools_to_mask(column, stream, mr); + auto bitmask = detail::bools_to_mask(column, stream, mr); NANOARROW_RETURN_NOT_OK(populate_data_buffer( device_span(reinterpret_cast(bitmask.first->data()), bitmask.first->size()), diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index c12f65deb46..452aebf4428 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -59,10 +59,10 @@ std::pair, cudf::size_type> bools_to_mask( } // namespace detail std::pair, cudf::size_type> bools_to_mask( - column_view const& input, rmm::device_async_resource_ref mr) + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::bools_to_mask(input, cudf::get_default_stream(), mr); + return detail::bools_to_mask(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu index 7960731f3a1..c4fc8d58552 100644 --- a/cpp/src/transform/compute_column.cu +++ b/cpp/src/transform/compute_column.cu @@ -138,10 +138,11 @@ std::unique_ptr compute_column(table_view const& table, std::unique_ptr compute_column(table_view const& table, ast::expression const& expr, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_column(table, expr, cudf::get_default_stream(), mr); + return detail::compute_column(table, expr, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 7a044b9f6f7..1c9d52bce1b 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -72,10 +72,10 @@ std::pair, std::unique_ptr> encode(table_view con } // namespace detail std::pair, std::unique_ptr> encode( - cudf::table_view const& input, rmm::device_async_resource_ref mr) + cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::encode(input, cudf::get_default_stream(), mr); + return detail::encode(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu index adf5db02d9c..be0b80a2633 100644 --- a/cpp/src/transform/mask_to_bools.cu +++ b/cpp/src/transform/mask_to_bools.cu @@ -62,9 +62,10 @@ std::unique_ptr mask_to_bools(bitmask_type const* bitmask, std::unique_ptr mask_to_bools(bitmask_type const* bitmask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::mask_to_bools(bitmask, begin_bit, end_bit, cudf::get_default_stream(), mr); + return detail::mask_to_bools(bitmask, begin_bit, end_bit, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index fd4f33c594c..a24ba304004 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -93,10 +93,10 @@ std::pair, cudf::size_type> nans_to_nulls( } // namespace detail std::pair, cudf::size_type> nans_to_nulls( - column_view const& input, rmm::device_async_resource_ref mr) + column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::nans_to_nulls(input, cudf::get_default_stream(), mr); + return detail::nans_to_nulls(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 808f2d1b284..46e6e55b0b7 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -115,9 +115,10 @@ std::pair, table_view> one_hot_encode(column_view const& std::pair, table_view> one_hot_encode(column_view const& input, column_view const& categories, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::one_hot_encode(input, categories, cudf::get_default_stream(), mr); + return detail::one_hot_encode(input, categories, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 12a15eb7e34..4530fabf889 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -561,23 +561,26 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return segmented_row_bit_count(t, 1, stream, mr); + return detail::segmented_row_bit_count(t, 1, stream, mr); } } // namespace detail std::unique_ptr segmented_row_bit_count(table_view const& t, size_type segment_length, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr); + return detail::segmented_row_bit_count(t, segment_length, stream, mr); } -std::unique_ptr row_bit_count(table_view const& t, rmm::device_async_resource_ref mr) +std::unique_ptr row_bit_count(table_view const& t, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::row_bit_count(t, cudf::get_default_stream(), mr); + return detail::row_bit_count(t, stream, mr); } } // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 98ec44758b9..f5e9048fa0a 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -97,10 +97,11 @@ std::unique_ptr transform(column_view const& input, std::string const& unary_udf, data_type output_type, bool is_ptx, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::transform(input, unary_udf, output_type, is_ptx, cudf::get_default_stream(), mr); + return detail::transform(input, unary_udf, output_type, is_ptx, stream, mr); } } // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4dffcb41ba2..96c30dc7ea3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -735,6 +735,7 @@ ConfigureTest( STREAM_MODE testing ) +ConfigureTest(STREAM_TRANSFORM_TEST streams/transform_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp new file mode 100644 index 00000000000..9187672221c --- /dev/null +++ b/cpp/tests/streams/transform_test.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +class TransformTest : public cudf::test::BaseFixture {}; + +template +void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx) +{ + auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init); + cudf::test::fixed_width_column_wrapper in( + data_iter, data_iter + size, all_valid); + cudf::transform( + in, udf, cudf::data_type(cudf::type_to_id()), is_ptx, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, Transform) +{ + char const* cuda = + R"***( +__device__ inline void fdsf ( + float* C, + float a +) +{ + *C = a*a*a*a; +} +)***"; + + char const* ptx = + R"***( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-24817639 +// Cuda compilation tools, release 10.0, V10.0.130 +// Based on LLVM 3.4svn +// + +.version 6.3 +.target sm_70 +.address_size 64 + + // .globl _ZN8__main__7add$241Ef +.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Ef; +.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Efx; + +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Ef( + .param .b64 _ZN8__main__7add$241Ef_param_0, + .param .b32 _ZN8__main__7add$241Ef_param_1 +) +{ + .reg .f32 %f<4>; + .reg .b32 %r<2>; + .reg .b64 %rd<2>; + + + ld.param.u64 %rd1, [_ZN8__main__7add$241Ef_param_0]; + ld.param.f32 %f1, [_ZN8__main__7add$241Ef_param_1]; + mul.f32 %f2, %f1, %f1; + mul.f32 %f3, %f2, %f2; + st.f32 [%rd1], %f3; + mov.u32 %r1, 0; + st.param.b32 [func_retval0+0], %r1; + ret; +} +)***"; + + auto data_init = [](cudf::size_type row) { return row % 3; }; + test_udf(cuda, data_init, 500, false); + test_udf(ptx, data_init, 500, true); +} + +TEST_F(TransformTest, ComputeColumn) +{ + auto c_0 = cudf::test::fixed_width_column_wrapper{3, 20, 1, 50}; + auto c_1 = cudf::test::fixed_width_column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1); + cudf::compute_column(table, expression, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, BoolsToMask) +{ + std::vector input({1, 0, 1, 0, 1, 0, 1, 0}); + cudf::test::fixed_width_column_wrapper input_column(input.begin(), input.end()); + cudf::bools_to_mask(input_column, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, MaskToBools) +{ + cudf::mask_to_bools(nullptr, 0, 0, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, Encode) +{ + cudf::test::fixed_width_column_wrapper input{{1, 2, 3, 2, 3, 2, 1}}; + cudf::encode(cudf::table_view({input}), cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, OneHotEncode) +{ + auto input = cudf::test::fixed_width_column_wrapper{8, 8, 8, 9, 9}; + auto category = cudf::test::fixed_width_column_wrapper{8, 9}; + cudf::one_hot_encode(input, category, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, NaNsToNulls) +{ + std::vector input = {1, 2, 3, 4, 5}; + std::vector mask = {true, true, true, true, false, false}; + auto input_column = + cudf::test::fixed_width_column_wrapper(input.begin(), input.end(), mask.begin()); + cudf::nans_to_nulls(input_column, cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, RowBitCount) +{ + std::vector strings{"abc", "ï", "", "z", "bananas", "warp", "", "zing"}; + cudf::test::strings_column_wrapper col(strings.begin(), strings.end()); + cudf::row_bit_count(cudf::table_view({col}), cudf::test::get_default_stream()); +} + +TEST_F(TransformTest, SegmentedRowBitCount) +{ + // clang-format off + std::vector const strings { "daïs", "def", "", "z", "bananas", "warp", "", "zing" }; + std::vector const valids { 1, 0, 0, 1, 0, 1, 1, 1 }; + // clang-format on + cudf::test::strings_column_wrapper const col(strings.begin(), strings.end(), valids.begin()); + auto const input = cudf::table_view({col}); + auto constexpr segment_length = 2; + cudf::segmented_row_bit_count(input, segment_length, cudf::test::get_default_stream()); +}