diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 6d4c8ec9b8c..cf3e94029be 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ class data_sink { * @brief Create a sink from a file path * * @param[in] filepath Path to the file to use + * @return Constructed data_sink object */ static std::unique_ptr create(const std::string& filepath); @@ -45,6 +46,7 @@ class data_sink { * @brief Create a sink from a std::vector * * @param[in,out] buffer Pointer to the output vector + * @return Constructed data_sink object */ static std::unique_ptr create(std::vector* buffer); @@ -54,6 +56,7 @@ class data_sink { * A useful code path for benchmarking, to eliminate physical * hardware randomness from profiling. * + * @return Constructed data_sink object */ static std::unique_ptr create(); @@ -66,6 +69,7 @@ class data_sink { * class that wraps the user pointer. The principle is to allow the user to declare * a custom sink instance and use it across multiple write() calls. * + * @return Constructed data_sink object */ static std::unique_ptr create(cudf::io::data_sink* const user_sink); @@ -73,6 +77,7 @@ class data_sink { * @brief Creates a vector of data sinks, one per element in the input vector. 
* * @param[in] args vector of parameters + * @return Constructed vector of data sinks */ template static std::vector> create(std::vector const& args) @@ -91,7 +96,7 @@ class data_sink { virtual ~data_sink(){}; /** - * @brief Append the buffer content to the sink + * @pure @brief Append the buffer content to the sink * * @param[in] data Pointer to the buffer to be written into the sink object * @param[in] size Number of bytes to write @@ -118,7 +123,7 @@ class data_sink { * instead of write() when possible. However, it is still possible to receive * write() calls as well. * - * @return bool If this writer supports device_write() calls. + * @return bool If this writer supports device_write() calls */ [[nodiscard]] virtual bool supports_device_write() const { return false; } @@ -172,6 +177,7 @@ class data_sink { * @param gpu_data Pointer to the buffer to be written into the sink object * @param size Number of bytes to write * @param stream CUDA stream to use + * @return a future that can be used to synchronize the call */ virtual std::future device_write_async(void const* gpu_data, size_t size, @@ -181,12 +187,12 @@ class data_sink { } /** - * @brief Flush the data written into the sink + * @pure @brief Flush the data written into the sink */ virtual void flush() = 0; /** - * @brief Returns the total number of bytes written into this sink + * @pure @brief Returns the total number of bytes written into this sink * * @return size_t Total number of bytes written into this sink */ diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 18ab8aad088..907830de2bb 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,6 +42,8 @@ namespace io { */ class datasource { public: + template + class owning_buffer; // forward declaration /** * @brief Interface class for buffers that the datasource returns to the caller. * @@ -50,12 +52,16 @@ class datasource { class buffer { public: /** - * @brief Returns the buffer size in bytes. + * @pure @brief Returns the buffer size in bytes. + * + * @return Buffer size in bytes */ [[nodiscard]] virtual size_t size() const = 0; /** - * @brief Returns the address of the data in the buffer. + * @pure @brief Returns the address of the data in the buffer. + * + * @return Address of the data in the buffer */ [[nodiscard]] virtual uint8_t const* data() const = 0; @@ -64,8 +70,18 @@ class datasource { */ virtual ~buffer() {} + /** + * @brief Factory to construct a datasource buffer object from a container. + * + * @tparam Container Type of the container to construct the buffer from + * @param data_owner The container to construct the buffer from (ownership is transferred) + * @return Constructed buffer object + */ template - static std::unique_ptr create(Container&& data_owner); + static std::unique_ptr create(Container&& data_owner) + { + return std::make_unique>(std::move(data_owner)); + } }; /** @@ -74,6 +90,7 @@ class datasource { * @param[in] filepath Path to the file to use * @param[in] offset Bytes from the start of the file (the default is zero) * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @return Constructed datasource object */ static std::unique_ptr create(const std::string& filepath, size_t offset = 0, @@ -83,6 +100,7 @@ class datasource { * @brief Creates a source from a memory buffer. * * @param[in] buffer Host buffer object + * @return Constructed datasource object */ static std::unique_ptr create(host_buffer const& buffer); @@ -90,6 +108,7 @@ class datasource { * @brief Creates a source from a from an Arrow file. 
* * @param[in] arrow_file RandomAccessFile to which the API calls are forwarded + * @return Constructed datasource object */ static std::unique_ptr create( std::shared_ptr arrow_file); @@ -98,6 +117,7 @@ class datasource { * @brief Creates a source from an user implemented datasource object. * * @param[in] source Non-owning pointer to the datasource object + * @return Constructed datasource object */ static std::unique_ptr create(datasource* source); @@ -105,6 +125,7 @@ class datasource { * @brief Creates a vector of datasources, one per element in the input vector. * * @param[in] args vector of parameters + * @return Constructed vector of datasource objects */ template static std::vector> create(std::vector const& args) @@ -262,10 +283,26 @@ class datasource { public: non_owning_buffer() {} + /** + * @brief Construct a new non owning buffer object + * + * @param data The data buffer + * @param size The size of the data buffer + */ non_owning_buffer(uint8_t* data, size_t size) : _data(data), _size(size) {} + /** + * @brief Returns the size of the buffer. + * + * @return The size of the buffer in bytes + */ [[nodiscard]] size_t size() const override { return _size; } + /** + * @brief Returns the pointer to the buffer. + * + * @return Pointer to the buffer + */ [[nodiscard]] uint8_t const* data() const override { return _data; } private: @@ -285,6 +322,8 @@ class datasource { public: /** * @brief Moves the input container into the newly created object. + * + * @param data_owner The container to construct the buffer from (ownership is transferred) */ owning_buffer(Container&& data_owner) : _data(std::move(data_owner)), _data_ptr(_data.data()), _size(_data.size()) @@ -294,14 +333,28 @@ class datasource { /** * @brief Moves the input container into the newly created object, and exposes a subspan of the * buffer. 
+ * + * @param data_owner The container to construct the buffer from (ownership is transferred) + * @param data_ptr Pointer to the start of the subspan + * @param size The size of the subspan */ owning_buffer(Container&& data_owner, uint8_t const* data_ptr, size_t size) : _data(std::move(data_owner)), _data_ptr(data_ptr), _size(size) { } + /** + * @brief Returns the size of the buffer. + * + * @return The size of the buffer in bytes + */ [[nodiscard]] size_t size() const override { return _size; } + /** + * @brief Returns the pointer to the data in the buffer. + * + * @return Pointer to the data in the buffer + */ [[nodiscard]] uint8_t const* data() const override { return static_cast(_data_ptr); @@ -314,12 +367,6 @@ class datasource { }; }; -template -std::unique_ptr datasource::buffer::create(Container&& data_owner) -{ - return std::make_unique>(std::move(data_owner)); -} - /** * @brief Implementation class for reading from an Apache Arrow file. The file * could be a memory-mapped file or other implementation supported by Arrow. @@ -378,6 +425,10 @@ class arrow_io_source : public datasource { /** * @brief Returns a buffer with a subset of data from the `arrow` source. + * + * @param offset The offset in bytes from which to read + * @param size The number of bytes to read + * @return A buffer with the read data */ std::unique_ptr host_read(size_t offset, size_t size) override { @@ -388,6 +439,11 @@ class arrow_io_source : public datasource { /** * @brief Reads a selected range from the `arrow` source into a preallocated buffer. + * + * @param[in] offset The offset in bytes from which to read + * @param[in] size The number of bytes to read + * @param[out] dst The preallocated buffer to read into + * @return The number of bytes read */ size_t host_read(size_t offset, size_t size, uint8_t* dst) override { @@ -398,6 +454,8 @@ class arrow_io_source : public datasource { /** * @brief Returns the size of the data in the `arrow` source. 
+ * + * @return The size of the data in the `arrow` source */ [[nodiscard]] size_t size() const override { diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp index cb2d00f0d1f..90ecabc7d0c 100644 --- a/cpp/include/cudf/io/text/byte_range_info.hpp +++ b/cpp/include/cudf/io/text/byte_range_info.hpp @@ -30,21 +30,48 @@ namespace text { */ class byte_range_info { private: - int64_t _offset; - int64_t _size; + int64_t _offset; ///< offset in bytes + int64_t _size; ///< size in bytes public: constexpr byte_range_info() noexcept : _offset(0), _size(0) {} + /** + * @brief Constructs a byte_range_info object + * + * @param offset offset in bytes + * @param size size in bytes + */ constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size) { CUDF_EXPECTS(offset >= 0, "offset must be non-negative"); CUDF_EXPECTS(size >= 0, "size must be non-negative"); } + /** + * @brief Copy constructor + * + * @param other byte_range_info object to copy + */ constexpr byte_range_info(byte_range_info const& other) noexcept = default; + /** + * @brief Copy assignment operator + * + * @param other byte_range_info object to copy + * @return this object after copying + */ constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default; + /** + * @brief Get the offset in bytes + * + * @return Offset in bytes + */ [[nodiscard]] constexpr int64_t offset() { return _offset; } + /** + * @brief Get the size in bytes + * + * @return Size in bytes + */ [[nodiscard]] constexpr int64_t size() { return _size; } }; diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp index 650f4e7f92e..28204c82780 100644 --- a/cpp/include/cudf/io/text/data_chunk_source.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source.hpp @@ -36,9 +36,24 @@ namespace text { */ class device_data_chunk { public: - virtual ~device_data_chunk() = default; - [[nodiscard]] virtual char 
const* data() const = 0; - [[nodiscard]] virtual std::size_t size() const = 0; + virtual ~device_data_chunk() = default; + /** + * @pure @brief Returns a pointer to the underlying device data. + * + * @return A pointer to the underlying device data + */ + [[nodiscard]] virtual char const* data() const = 0; + /** + * @pure @brief Returns the size of the underlying device data. + * + * @return The size of the underlying device data + */ + [[nodiscard]] virtual std::size_t size() const = 0; + /** + * @pure @brief Returns a span over the underlying device data. + * + * @return A span over the underlying device data + */ virtual operator device_span() const = 0; }; @@ -53,18 +68,23 @@ class device_data_chunk { */ class data_chunk_reader { public: - virtual ~data_chunk_reader() = default; + virtual ~data_chunk_reader() = default; + /** + * @pure @brief Skips the specified number of bytes in the data source. + * + * @param size The number of bytes to skip + */ virtual void skip_bytes(std::size_t size) = 0; /** - * @brief Get the next chunk of bytes from the data source + * @pure @brief Get the next chunk of bytes from the data source * * Performs any necessary work to read and prepare the underlying data source for consumption as a * view over device memory. Common implementations may read from a file, copy data from host * memory, allocate temporary memory, perform iterative decompression, or even launch device * kernels. * - * @param size number of bytes to read. + * @param size number of bytes to read * @param stream stream to associate allocations or perform work required to obtain chunk * @return a chunk of data up to @p size bytes. May return less than @p size bytes if * reader reaches end of underlying data source. 
Returned data must be accessed in stream order @@ -80,7 +100,13 @@ class data_chunk_reader { */ class data_chunk_source { public: - virtual ~data_chunk_source() = default; + virtual ~data_chunk_source() = default; + + /** + * @pure @brief Get a reader for the data source. + * + * @return `data_chunk_reader` object for the data source + */ [[nodiscard]] virtual std::unique_ptr create_reader() const = 0; }; diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index bd1e3be838b..96f169bcb7c 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -103,8 +103,13 @@ enum statistics_freq { * cudf columns. */ struct column_name_info { - std::string name; - std::vector children; + std::string name; ///< Column name + std::vector children; ///< Child column names + /** + * @brief Construct a column name info with a name and no children + * + * @param _name Column name + */ column_name_info(std::string const& _name) : name(_name) {} column_name_info() = default; }; @@ -138,8 +143,8 @@ struct table_metadata { * @brief Table with table metadata used by io readers to return the metadata by value */ struct table_with_metadata { - std::unique_ptr tbl; - table_metadata metadata; + std::unique_ptr
tbl; //!< Table + table_metadata metadata; //!< Table metadata }; /** @@ -148,9 +153,15 @@ struct table_with_metadata { * Used to describe buffer input in `source_info` objects. */ struct host_buffer { - const char* data = nullptr; - size_t size = 0; + const char* data = nullptr; //!< Pointer to the buffer + size_t size = 0; //!< Size of the buffer host_buffer() = default; + /** + * @brief Construct a new host buffer object + * + * @param data Pointer to the buffer + * @param size Size of the buffer + */ host_buffer(const char* data, size_t size) : data(data), size(size) {} }; @@ -158,35 +169,94 @@ struct host_buffer { * @brief Source information for read interfaces */ struct source_info { - std::vector> _files; + std::vector> _files; //!< Input files source_info() = default; + /** + * @brief Construct a new source info object for multiple files + * + * @param file_paths Input file paths + */ explicit source_info(std::vector const& file_paths) : _filepaths(file_paths) {} + + /** + * @brief Construct a new source info object for a single file + * + * @param file_path Single input file + */ explicit source_info(std::string const& file_path) : _filepaths({file_path}) {} + /** + * @brief Construct a new source info object for multiple buffers in host memory + * + * @param host_buffers Input buffers in host memory + */ explicit source_info(std::vector const& host_buffers) : _type(io_type::HOST_BUFFER), _buffers(host_buffers) { } + + /** + * @brief Construct a new source info object for a single buffer + * + * @param host_data Input buffer in host memory + * @param size Size of the buffer + */ explicit source_info(const char* host_data, size_t size) : _type(io_type::HOST_BUFFER), _buffers({{host_data, size}}) { } + /** + * @brief Construct a new source info object for multiple user-implemented sources + * + * @param sources User-implemented input sources + */ explicit source_info(std::vector const& sources) : _type(io_type::USER_IMPLEMENTED), _user_sources(sources) { } +
+ /** + * @brief Construct a new source info object for a single user-implemented source + * + * @param source Single user-implemented input source + */ explicit source_info(cudf::io::datasource* source) : _type(io_type::USER_IMPLEMENTED), _user_sources({source}) { } + /** + * @brief Get the type of the input + * + * @return The type of the input + */ [[nodiscard]] auto type() const { return _type; } + /** + * @brief Get the filepaths of the input + * + * @return The filepaths of the input + */ [[nodiscard]] auto const& filepaths() const { return _filepaths; } + /** + * @brief Get the host buffers of the input + * + * @return The host buffers of the input + */ [[nodiscard]] auto const& buffers() const { return _buffers; } + /** + * @brief Get the input files + * + * @return The input files + */ [[nodiscard]] auto const& files() const { return _files; } + /** + * @brief Get the user sources of the input + * + * @return The user sources of the input + */ [[nodiscard]] auto const& user_sources() const { return _user_sources; } private: @@ -201,36 +271,98 @@ struct source_info { */ struct sink_info { sink_info() = default; + /** + * @brief Construct a new sink info object + * + * @param num_sinks Number of sinks + */ sink_info(size_t num_sinks) : _num_sinks(num_sinks) {} + /** + * @brief Construct a new sink info object for multiple files + * + * @param file_paths Output file paths + */ explicit sink_info(std::vector const& file_paths) : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths) { } + + /** + * @brief Construct a new sink info object for a single file + * + * @param file_path Single output file path + */ explicit sink_info(std::string const& file_path) : _type(io_type::FILEPATH), _filepaths({file_path}) { } + /** + * @brief Construct a new sink info object for multiple host buffers + * + * @param buffers Output host buffers + */ explicit sink_info(std::vector*> const& buffers) : _type(io_type::HOST_BUFFER),
_num_sinks(buffers.size()), _buffers(buffers) { } + /** + * @brief Construct a new sink info object for a single host buffer + * + * @param buffer Single output host buffer + */ explicit sink_info(std::vector* buffer) : _type(io_type::HOST_BUFFER), _buffers({buffer}) {} + /** + * @brief Construct a new sink info object for multiple user-implemented sinks + * + * @param user_sinks Output user-implemented sinks + */ explicit sink_info(std::vector const& user_sinks) : _type(io_type::USER_IMPLEMENTED), _num_sinks(user_sinks.size()), _user_sinks(user_sinks) { } + + /** + * @brief Construct a new sink info object for a single user-implemented sink + * + * @param user_sink Single output user-implemented sink + */ explicit sink_info(class cudf::io::data_sink* user_sink) : _type(io_type::USER_IMPLEMENTED), _user_sinks({user_sink}) { } + /** + * @brief Get the type of the output + * + * @return The type of the output + */ [[nodiscard]] auto type() const { return _type; } + /** + * @brief Get the number of sinks + * + * @return The number of sinks + */ [[nodiscard]] auto num_sinks() const { return _num_sinks; } + /** + * @brief Get the filepaths of the output + * + * @return The filepaths of the output + */ [[nodiscard]] auto const& filepaths() const { return _filepaths; } + /** + * @brief Get the host buffers of the output + * + * @return The host buffers of the output + */ [[nodiscard]] auto const& buffers() const { return _buffers; } + /** + * @brief Get the user sinks of the output + * + * @return The user sinks of the output + */ [[nodiscard]] auto const& user_sinks() const { return _user_sinks; } private: @@ -243,6 +375,9 @@ struct sink_info { class table_input_metadata; +/** + * @brief Metadata for a column + */ class column_in_metadata { friend table_input_metadata; std::string _name = ""; @@ -256,10 +391,16 @@ class column_in_metadata { public: column_in_metadata() = default; + /** + * @brief Construct a new column in metadata object + * + * @param name Column name + */
column_in_metadata(std::string_view name) : _name{name} {} /** - * @brief Get the children of this column metadata + * @brief Add a child column's metadata to this column * + * @param child The child column metadata to add * @return this for chaining */ column_in_metadata& add_child(column_in_metadata const& child) @@ -271,6 +412,7 @@ class column_in_metadata { /** * @brief Set the name of this column * + * @param name Name of the column * @return this for chaining */ column_in_metadata& set_name(std::string const& name) @@ -284,7 +426,8 @@ class column_in_metadata { * * Only valid in case of chunked writes. In single writes, this option is ignored. * - * @return column_in_metadata& + * @param nullable Whether this column is nullable + * @return this for chaining */ column_in_metadata& set_nullability(bool nullable) { @@ -362,11 +505,15 @@ class column_in_metadata { /** * @brief Get the name of this column + * + * @return The name of this column */ [[nodiscard]] std::string get_name() const { return _name; } /** * @brief Get whether nullability has been explicitly set for this column. + * + * @return Boolean indicating whether nullability has been explicitly set for this column */ [[nodiscard]] bool is_nullability_defined() const { return _nullable.has_value(); } @@ -374,21 +521,29 @@ class column_in_metadata { * @brief Gets the explicitly set nullability for this column. * @throws If nullability is not explicitly defined for this column. * Check using `is_nullability_defined()` first. + * @return Boolean indicating whether this column is nullable */ [[nodiscard]] bool nullable() const { return _nullable.value(); } /** * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map.
+ * + * @return Boolean indicating whether this column is to be encoded as a map */ [[nodiscard]] bool is_map() const { return _list_column_is_map; } /** * @brief Get whether to encode this timestamp column using deprecated int96 physical type + * + * @return Boolean indicating whether to encode this timestamp column using deprecated int96 + * physical type */ [[nodiscard]] bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } /** * @brief Get whether precision has been set for this decimal column + * + * @return Boolean indicating whether precision has been set for this decimal column */ [[nodiscard]] bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } @@ -396,11 +551,14 @@ class column_in_metadata { * @brief Get the decimal precision that was set for this column. * @throws If decimal precision was not set for this column. * Check using `is_decimal_precision_set()` first. + * @return The decimal precision that was set for this column */ [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); } /** * @brief Get whether parquet field id has been set for this column. + * + * @return Boolean indicating whether parquet field id has been set for this column */ [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); } @@ -408,15 +566,21 @@ class column_in_metadata { * @brief Get the parquet field id that was set for this column. * @throws If parquet field id was not set for this column. * Check using `is_parquet_field_id_set()` first. 
+ * @return The parquet field id that was set for this column */ [[nodiscard]] int32_t get_parquet_field_id() const { return _parquet_field_id.value(); } /** * @brief Get the number of children of this column + * + * @return The number of children of this column */ [[nodiscard]] size_type num_children() const { return children.size(); } }; +/** + * @brief Metadata for a table + */ class table_input_metadata { public: table_input_metadata() = default; // Required by cython @@ -430,7 +594,7 @@ class table_input_metadata { */ table_input_metadata(table_view const& table); - std::vector column_metadata; + std::vector column_metadata; //!< List of column metadata }; /** @@ -440,10 +604,16 @@ class table_input_metadata { * writing, one partition_info struct defines one partition and corresponds to one output file */ struct partition_info { - size_type start_row; - size_type num_rows; + size_type start_row; //!< The start row of the partition + size_type num_rows; //!< The number of rows in the partition partition_info() = default; + /** + * @brief Construct a new partition_info + * + * @param start_row The start row of the partition + * @param num_rows The number of rows in the partition + */ partition_info(size_type start_row, size_type num_rows) : start_row(start_row), num_rows(num_rows) { }