diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 3ef356bed1b..a3f76817f8a 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -393,6 +393,7 @@ class orc_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. * @@ -400,6 +401,7 @@ class orc_reader_options_builder { */ table_with_metadata read_orc( orc_reader_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -864,8 +866,10 @@ class orc_writer_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches */ -void write_orc(orc_writer_options const& options); +void write_orc(orc_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Builds settings to use for `write_orc_chunked()`. @@ -1287,8 +1291,10 @@ class orc_chunked_writer { * @brief Constructor with chunked writer options * * @param[in] options options used to write table + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ - orc_chunked_writer(chunked_orc_writer_options const& options); + orc_chunked_writer(chunked_orc_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Writes table to output. diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 25e0c130dff..19d44263d1b 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,10 +63,12 @@ struct raw_orc_statistics { * @endcode * * @param src_info Dataset source + * @param stream CUDA stream used for device memory operations and kernel launches * * @return Column names and encoded ORC statistics */ -raw_orc_statistics read_raw_orc_statistics(source_info const& src_info); +raw_orc_statistics read_raw_orc_statistics( + source_info const& src_info, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Monostate type alias for the statistics variant. @@ -207,10 +209,12 @@ struct parsed_orc_statistics { * @ingroup io_readers * * @param src_info Dataset source + * @param stream CUDA stream used for device memory operations and kernel launches * * @return Column names and decoded ORC statistics */ -parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info); +parsed_orc_statistics read_parsed_orc_statistics( + source_info const& src_info, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Schema of an ORC column, including the nested columns. @@ -368,10 +372,12 @@ class orc_metadata { * @ingroup io_readers * * @param src_info Dataset source + * @param stream CUDA stream used for device memory operations and kernel launches * * @return orc_metadata with ORC schema, number of rows and number of stripes. 
*/ -orc_metadata read_orc_metadata(source_info const& src_info); +orc_metadata read_orc_metadata(source_info const& src_info, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace io diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index e5489963618..42f2fd02d52 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -269,9 +269,9 @@ void write_csv(csv_writer_options const& options, mr); } -raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) +raw_orc_statistics read_raw_orc_statistics(source_info const& src_info, + rmm::cuda_stream_view stream) { - auto stream = cudf::get_default_stream(); // Get source to read statistics from std::unique_ptr source; if (src_info.type() == io_type::FILEPATH) { @@ -342,9 +342,10 @@ column_statistics::column_statistics(orc::column_statistics&& cs) } } -parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) +parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info, + rmm::cuda_stream_view stream) { - auto const raw_stats = read_raw_orc_statistics(src_info); + auto const raw_stats = read_raw_orc_statistics(src_info, stream); parsed_orc_statistics result; result.column_names = raw_stats.column_names; @@ -395,12 +396,12 @@ orc_column_schema make_orc_column_schema(host_span orc_sc } }; // namespace -orc_metadata read_orc_metadata(source_info const& src_info) +orc_metadata read_orc_metadata(source_info const& src_info, rmm::cuda_stream_view stream) { auto sources = make_datasources(src_info); CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - auto const footer = orc::metadata(sources.front().get(), cudf::detail::default_stream_value).ff; + auto const footer = orc::metadata(sources.front().get(), stream).ff; return {{make_orc_column_schema(footer.types, 0, "")}, static_cast(footer.numberOfRows), @@ -410,21 +411,21 @@ orc_metadata read_orc_metadata(source_info const& 
src_info) /** * @copydoc cudf::io::read_orc */ -table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_orc(orc_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = std::make_unique( - std::move(datasources), options, cudf::get_default_stream(), mr); - + auto reader = std::make_unique(std::move(datasources), options, stream, mr); return reader->read(options); } /** * @copydoc cudf::io::write_orc */ -void write_orc(orc_writer_options const& options) +void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; @@ -434,8 +435,7 @@ void write_orc(orc_writer_options const& options) CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); auto writer = std::make_unique( - std::move(sinks[0]), options, io_detail::single_write_mode::YES, cudf::get_default_stream()); - + std::move(sinks[0]), options, io_detail::single_write_mode::YES, stream); try { writer->write(options.get_table()); } catch (...) 
{ @@ -451,7 +451,8 @@ void write_orc(orc_writer_options const& options) /** * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer */ -orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options) +orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options, + rmm::cuda_stream_view stream) { namespace io_detail = cudf::io::detail; @@ -459,7 +460,7 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); writer = std::make_unique( - std::move(sinks[0]), options, io_detail::single_write_mode::NO, cudf::get_default_stream()); + std::move(sinks[0]), options, io_detail::single_write_mode::NO, stream); } /** diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f7b805b68f5..b385c63e9cd 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -655,6 +655,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp new file mode 100644 index 00000000000..929c3697b3b --- /dev/null +++ b/cpp/tests/streams/io/orc_test.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class ORCTest : public cudf::test::BaseFixture {}; + +template +std::vector> make_uniqueptrs_vector(UniqPtrs&&... uniqptrs) +{ + std::vector> ptrsvec; + (ptrsvec.push_back(std::forward(uniqptrs)), ...); + return ptrsvec; +} + +cudf::table construct_table() +{ + constexpr auto num_rows = 10; + + auto const zeros_iterator = thrust::make_constant_iterator(0); + auto const ones_iterator = thrust::make_constant_iterator(1); + + cudf::test::fixed_width_column_wrapper col0(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col1(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col2(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col3(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col4(zeros_iterator, zeros_iterator + num_rows); + cudf::test::fixed_width_column_wrapper col5(zeros_iterator, zeros_iterator + num_rows); + + cudf::test::fixed_width_column_wrapper col6 = [&ones_iterator, num_rows] { + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones_iterator[i], numeric::scale_type{12}}; + }); + return cudf::test::fixed_width_column_wrapper(col6_data, + col6_data + 
num_rows); + }(); + + cudf::test::fixed_width_column_wrapper col7 = [&ones_iterator, num_rows] { + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}}; + }); + return cudf::test::fixed_width_column_wrapper(col7_data, + col7_data + num_rows); + }(); + + cudf::test::lists_column_wrapper col8 = [] { + auto col8_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + return cudf::test::lists_column_wrapper( + {{1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}}, + col8_mask); + }(); + + cudf::test::structs_column_wrapper col9 = [&ones_iterator] { + auto child_col_mask = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); + cudf::test::fixed_width_column_wrapper child_col( + ones_iterator, ones_iterator + num_rows, child_col_mask); + return cudf::test::structs_column_wrapper{child_col}; + }(); + + cudf::test::strings_column_wrapper col10 = [] { + std::vector col10_data(num_rows, "rapids"); + return cudf::test::strings_column_wrapper(col10_data.begin(), col10_data.end()); + }(); + + auto colsptr = make_uniqueptrs_vector(col0.release(), + col1.release(), + col2.release(), + col3.release(), + col4.release(), + col5.release(), + col6.release(), + col7.release(), + col8.release(), + col9.release(), + col10.release()); + return cudf::table(std::move(colsptr)); +} + +TEST_F(ORCTest, ORCWriter) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); + cudf::io::orc_writer_options out_opts = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_orc(out_opts, cudf::test::get_default_stream()); +} + +TEST_F(ORCTest, ORCReader) +{ + auto tab = construct_table(); + auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); + cudf::io::orc_writer_options out_opts = + 
cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, tab); + cudf::io::write_orc(out_opts, cudf::test::get_default_stream()); + + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(cudf::io::source_info{{filepath}}); + auto result = cudf::io::read_orc(read_opts, cudf::test::get_default_stream()); + auto const src = cudf::io::source_info{filepath}; // input for the stream-ordered metadata calls + auto meta = cudf::io::read_orc_metadata(src, cudf::test::get_default_stream()); + auto const stats = cudf::io::read_parsed_orc_statistics(src, cudf::test::get_default_stream()); +}