From b602c0575405020f27038b9b5fc4f43772d669bd Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 31 May 2024 09:46:20 -0700 Subject: [PATCH 01/10] Start migrating I/O to pylibcudf --- python/cudf/cudf/_lib/avro.pyx | 43 +++------- python/cudf/cudf/_lib/csv.pyx | 8 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 1 + .../cudf/_lib/pylibcudf/io/CMakeLists.txt | 25 ++++++ .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 3 + .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 3 + python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 12 +++ python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 58 ++++++++++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 32 ++++++++ python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 79 +++++++++++++++++++ .../cudf/_lib/pylibcudf/libcudf/io/orc.pxd | 6 +- .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 59 +++++++------- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 11 +++ 14 files changed, 274 insertions(+), 67 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.py create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pyx diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index ae17a5f1ab6..4d1af8523cb 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -1,17 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from cudf._lib.utils cimport data_from_pylibcudf_io -from cudf._lib.io.utils cimport make_source_info -from cudf._lib.pylibcudf.libcudf.io.avro cimport ( - avro_reader_options, - read_avro as libcudf_read_avro, -) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata -from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.utils cimport data_from_unique_ptr +import cudf._lib.pylibcudf as plc +from cudf._lib.pylibcudf.io.types import SourceInfo cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): @@ -31,25 +23,12 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): if not isinstance(skip_rows, int) or skip_rows < -1: raise TypeError("skip_rows must be an int >= -1") - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - cdef avro_reader_options options = move( - avro_reader_options.builder(make_source_info([datasource])) - .columns(c_columns) - .skip_rows( skip_rows) - .num_rows( num_rows) - .build() + return data_from_pylibcudf_io( + plc.io.avro.read_avro( + plc.io.avro.AvroReaderOptions( + SourceInfo([datasource]), + columns, + skip_rows, + num_rows) + ) ) - - cdef table_with_metadata c_result - - with nogil: - c_result = move(libcudf_read_avro(options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] - - return data_from_unique_ptr(move(c_result.tbl), column_names=names) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index aa771295607..0b0bbdb2589 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options( ) if quoting == 1: - c_quoting = quote_style.QUOTE_ALL + c_quoting = quote_style.ALL elif quoting == 2: - c_quoting = 
quote_style.QUOTE_NONNUMERIC + c_quoting = quote_style.NONNUMERIC elif quoting == 3: - c_quoting = quote_style.QUOTE_NONE + c_quoting = quote_style.NONE else: # Default value - c_quoting = quote_style.QUOTE_MINIMAL + c_quoting = quote_style.MINIMAL cdef csv_reader_options csv_reader_options_c = move( csv_reader_options.builder(c_source_info) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 7d01671e84f..b57817b3213 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -48,3 +48,4 @@ link_to_pyarrow_headers(pylibcudf_interop) add_subdirectory(libcudf) add_subdirectory(strings) +add_subdirectory(io) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt new file mode 100644 index 00000000000..2cfec101bab --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources avro.pyx types.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf +) + +set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types) +link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd new file mode 100644 index 00000000000..b704bbe4b04 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport avro, types diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py new file mode 100644 index 00000000000..6f47bb10aa5 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import avro, types diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd new file mode 100644 index 00000000000..f6eade1f444 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.io.types cimport TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options + + +cdef class AvroReaderOptions: + cdef avro_reader_options avro_opts + +cpdef TableWithMetadata read_avro(AvroReaderOptions options) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx new file mode 100644 index 00000000000..874db3f3ef4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -0,0 +1,58 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.io.avro cimport AvroReaderOptions +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from cudf._lib.pylibcudf.libcudf.io.avro cimport ( + avro_reader_options, + read_avro as cpp_read_avro, +) +from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata +from cudf._lib.pylibcudf.libcudf.types cimport size_type + + +cdef class AvroReaderOptions: + def __init__( + self, + SourceInfo source_info, + list columns, + size_type skip_rows, + size_type num_rows + ): + cdef vector[string] c_columns + if columns is not None and len(columns) > 0: + c_columns.reserve(len(columns)) + for col in columns: + c_columns.push_back(str(col).encode()) + + self.avro_opts = move( + avro_reader_options.builder(source_info.c_obj) + .columns(c_columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) + +cpdef TableWithMetadata read_avro(AvroReaderOptions options): + """ + Reads an Avro dataset into a set of columns. + + Parameters + ---------- + options : AvroReaderOptions + The set of options to pass to the Avro reader. + + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata that was read in. + """ + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_avro(options.avro_opts)) + + return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd new file mode 100644 index 00000000000..675f13fa48d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + column_encoding, + column_in_metadata, + column_name_info, + compression_type, + dictionary_policy, + io_type, + partition_info, + quote_style, + sink_info, + source_info, + statistics_freq, + table_input_metadata, + table_metadata, + table_with_metadata, +) +from cudf._lib.pylibcudf.table cimport Table + + +cdef class TableWithMetadata: + cdef Table tbl + cdef table_metadata metadata + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl) + +cdef class SourceInfo: + cdef source_info c_obj diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx new file mode 100644 index 00000000000..4aaa38f1325 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -0,0 +1,79 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +from cudf._lib.pylibcudf.libcudf.io.types cimport ( + host_buffer, + source_info, + table_with_metadata, +) + +import errno +import io +import os + + +cdef class TableWithMetadata: + + @property + def columns(self): + return self.tbl._columns + + @property + def column_names(self): + # TODO: Handle nesting (columns with child columns) + return [col_info.name.decode() for col_info in self.metadata.schema_info] + + @staticmethod + cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): + """Create a Python TableWithMetadata from a libcudf table_with_metadata""" + cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata) + out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl)) + out.metadata = tbl_with_meta.metadata + return out + +cdef class SourceInfo: + + def __init__(self, list sources): + if not sources: + raise ValueError("Need to pass at least one source") + + if isinstance(sources[0], os.PathLike) or isinstance(sources[0], str): + sources 
= [str(src) for src in sources] + + cdef vector[string] c_files + if isinstance(sources[0], str): + # If source is a file, return source_info where type=FILEPATH + if not all(os.path.isfile(file) for file in sources): + raise FileNotFoundError(errno.ENOENT, + os.strerror(errno.ENOENT), + sources) + + c_files.reserve(len(sources)) + for src in sources: + c_files.push_back( str(src).encode()) + + self.c_obj = move(source_info(c_files)) + return + + # TODO: host_buffer is deprecated API, use host_span instead + cdef vector[host_buffer] c_host_buffers + cdef const unsigned char[::1] c_buffer + cdef bint empty_buffer = False + if isinstance(sources[0], bytes): + empty_buffer = True + for buffer in sources: + if (len(buffer) > 0): + c_buffer = buffer + c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + empty_buffer = False + elif isinstance(sources[0], io.BytesIO): + for bio in sources: + c_buffer = bio.getbuffer() # check if empty? + c_host_buffers.push_back(host_buffer(&c_buffer[0], + c_buffer.shape[0])) + + self.c_obj = source_info(c_host_buffers) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd index e553515dfdf..25f91849dea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd @@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options_builder& compression( cudf_io_types.compression_type comp ) except + - orc_writer_options_builder& enable_statistics(bool val) except + + orc_writer_options_builder& enable_statistics( + cudf_io_types.statistics_freq val + ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + orc_writer_options_builder& row_index_stride(size_type val) except + @@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type comp ) except + 
chunked_orc_writer_options_builder& enable_statistics( - bool val + cudf_io_types.statistics_freq val ) except + orc_writer_options_builder& stripe_size_bytes(size_t val) except + orc_writer_options_builder& stripe_size_rows(size_type val) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 38fae1df1e5..27e4cf0ed64 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -15,50 +15,51 @@ cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type +from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span cdef extern from "cudf/io/types.hpp" \ namespace "cudf::io" nogil: - ctypedef enum quote_style: - QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL" - QUOTE_ALL "cudf::io::quote_style::ALL" - QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC" - QUOTE_NONE "cudf::io::quote_style::NONE" - - ctypedef enum compression_type: - NONE "cudf::io::compression_type::NONE" - AUTO "cudf::io::compression_type::AUTO" - SNAPPY "cudf::io::compression_type::SNAPPY" - GZIP "cudf::io::compression_type::GZIP" - BZIP2 "cudf::io::compression_type::BZIP2" - BROTLI "cudf::io::compression_type::BROTLI" - ZIP "cudf::io::compression_type::ZIP" - XZ "cudf::io::compression_type::XZ" - ZLIB "cudf::io::compression_type::ZLIB" - LZ4 "cudf::io::compression_type::LZ4" - LZO "cudf::io::compression_type::LZO" - ZSTD "cudf::io::compression_type::ZSTD" - - ctypedef enum io_type: - FILEPATH "cudf::io::io_type::FILEPATH" - HOST_BUFFER "cudf::io::io_type::HOST_BUFFER" - VOID "cudf::io::io_type::VOID" - USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED" - - ctypedef enum statistics_freq: + cpdef enum class quote_style(int32_t): + MINIMAL + ALL 
+ NONNUMERIC + NONE + + cpdef enum class compression_type(int32_t): + NONE + AUTO + SNAPPY + GZIP + BZIP2 + BROTLI + ZIP + XZ + ZLIB + LZ4 + LZO + ZSTD + + cpdef enum class io_type(int32_t): + FILEPATH + HOST_BUFFER + VOID + USER_IMPLEMENTED + + cpdef enum class statistics_freq(int32_t): STATISTICS_NONE = 0, STATISTICS_ROWGROUP = 1, STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, - ctypedef enum dictionary_policy: + cpdef enum class dictionary_policy(int32_t): NEVER = 0, ADAPTIVE = 1, ALWAYS = 2, cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil: - cpdef enum class column_encoding: + cpdef enum class column_encoding(int32_t): USE_DEFAULT = -1 DICTIONARY = 0 PLAIN = 1 diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index c5a1e7552b9..99850d549a1 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) +cdef data_from_pylibcudf_io(tbl_with_meta) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 4c4cd48d6ed..5931a2758ab 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): index_names ) +cdef data_from_pylibcudf_io(tbl_with_meta): + """ + Unpacks the TableWithMetadata from libcudf I/O + into a dict of columns and an Index (cuDF format) + """ + return _data_from_columns( + columns = [Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], + column_names = tbl_with_meta.column_names, + index_names = None + ) + cdef columns_from_table_view( table_view tv, object 
owners, From 6a1a2e6b5f1337ee6cd73c53a0cc991f3ef2f5b9 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 31 May 2024 11:04:52 -0700 Subject: [PATCH 02/10] add missing declaration --- python/cudf/cudf/_lib/parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index f0eef9be124..ac592cedaac 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -491,7 +491,7 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - dict_policy = ( + cdef cudf_io_types.dictionary_policy dict_policy = ( cudf_io_types.dictionary_policy.ADAPTIVE if use_dictionary else cudf_io_types.dictionary_policy.NEVER From b346a14052162fcb1868a3aa203d89514fa00067 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 31 May 2024 16:02:02 -0700 Subject: [PATCH 03/10] clean and add tests --- .../user_guide/api_docs/pylibcudf/index.rst | 9 ++ .../api_docs/pylibcudf/read_avro.rst | 6 + python/cudf/cudf/_lib/avro.pyx | 15 +- python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 13 +- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 57 ++++---- python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 2 +- .../cudf/cudf/pylibcudf_tests/common/utils.py | 18 +++ python/cudf/cudf/pylibcudf_tests/test_avro.py | 131 ++++++++++++++++++ 9 files changed, 209 insertions(+), 44 deletions(-) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/read_avro.rst create mode 100644 python/cudf/cudf/pylibcudf_tests/test_avro.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 1c1b37e2c37..ca7f640fcf4 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -30,3 +30,12 @@ This page provides API documentation for pylibcudf. table types unary + +I/O Functions +============= + +.. 
toctree:: + :maxdepth: 1 + :caption: I/O Functions + + avro diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/read_avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/read_avro.rst new file mode 100644 index 00000000000..495bd505fdc --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/read_avro.rst @@ -0,0 +1,6 @@ +==== +Avro +==== + +.. automodule:: cudf._lib.pylibcudf.io.avro + :members: diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx index 4d1af8523cb..3c132b22880 100644 --- a/python/cudf/cudf/_lib/avro.pyx +++ b/python/cudf/cudf/_lib/avro.pyx @@ -6,7 +6,7 @@ import cudf._lib.pylibcudf as plc from cudf._lib.pylibcudf.io.types import SourceInfo -cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): +cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): """ Cython function to call libcudf read_avro, see `read_avro`. @@ -20,15 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1): if not isinstance(num_rows, int) or num_rows < -1: raise TypeError("num_rows must be an int >= -1") - if not isinstance(skip_rows, int) or skip_rows < -1: - raise TypeError("skip_rows must be an int >= -1") + if not isinstance(skip_rows, int) or skip_rows < 0: + raise TypeError("skip_rows must be an int >= 0") return data_from_pylibcudf_io( plc.io.avro.read_avro( - plc.io.avro.AvroReaderOptions( - SourceInfo([datasource]), - columns, - skip_rows, - num_rows) + SourceInfo([datasource]), + columns, + skip_rows, + num_rows ) ) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd index f6eade1f444..00d12264029 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd @@ -2,11 +2,14 @@ from libcpp.memory cimport unique_ptr -from cudf._lib.pylibcudf.io.types cimport TableWithMetadata +from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata from 
cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options +from cudf._lib.pylibcudf.libcudf.types cimport size_type -cdef class AvroReaderOptions: - cdef avro_reader_options avro_opts - -cpdef TableWithMetadata read_avro(AvroReaderOptions options) +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = *, + size_type skip_rows = *, + size_type num_rows = * +) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 874db3f3ef4..283e4023d4e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -4,55 +4,54 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -from cudf._lib.pylibcudf.io.avro cimport AvroReaderOptions from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata from cudf._lib.pylibcudf.libcudf.io.avro cimport ( avro_reader_options, read_avro as cpp_read_avro, ) -from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata from cudf._lib.pylibcudf.libcudf.types cimport size_type -cdef class AvroReaderOptions: - def __init__( - self, - SourceInfo source_info, - list columns, - size_type skip_rows, - size_type num_rows - ): - cdef vector[string] c_columns - if columns is not None and len(columns) > 0: - c_columns.reserve(len(columns)) - for col in columns: - c_columns.push_back(str(col).encode()) - - self.avro_opts = move( - avro_reader_options.builder(source_info.c_obj) - .columns(c_columns) - .skip_rows(skip_rows) - .num_rows(num_rows) - .build() - ) - -cpdef TableWithMetadata read_avro(AvroReaderOptions options): +cpdef TableWithMetadata read_avro( + SourceInfo source_info, + list columns = [], # no-cython-lint + size_type skip_rows = 0, + size_type num_rows = -1 +): """ Reads an Avro dataset into a set of columns. Parameters ---------- - options : AvroReaderOptions - The set of options to pass to the Avro reader. 
+ source_info: SourceInfo + The SourceInfo object to read the avro dataset from. + columns: list, default [] + skip_rows: size_type, default 0 + The number of rows to skip. + num_rows: size_type, default -1 + The number of rows to read, after skipping rows. + If -1 is passed, all rows will be read. Returns ------- TableWithMetadata The Table and its corresponding metadata that was read in. """ - cdef table_with_metadata c_result + cdef vector[string] c_columns + if columns is not None and len(columns) > 0: + c_columns.reserve(len(columns)) + for col in columns: + c_columns.push_back(str(col).encode()) + + cdef avro_reader_options avro_opts = move( + avro_reader_options.builder(source_info.c_obj) + .columns(c_columns) + .skip_rows(skip_rows) + .num_rows(num_rows) + .build() + ) with nogil: - c_result = move(cpp_read_avro(options.avro_opts)) + c_result = move(cpp_read_avro(avro_opts)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 675f13fa48d..354d15e5667 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.table cimport Table cdef class TableWithMetadata: - cdef Table tbl + cdef public Table table cdef table_metadata metadata @staticmethod diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 4aaa38f1325..43efa522b5c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -30,7 +30,7 @@ cdef class TableWithMetadata: cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): """Create a Python TableWithMetadata from a libcudf table_with_metadata""" cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata) - out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl)) + out.table = Table.from_libcudf(move(tbl_with_meta.tbl)) 
out.metadata = tbl_with_meta.metadata return out diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 596cd2c92ae..800e29aaff0 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -47,6 +47,24 @@ def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None: assert_column_eq(plc_col, pa_col) +def assert_table_and_meta_eq( + plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table +) -> None: + """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" + + plc_table = plc_table_w_meta.table + + plc_shape = (plc_table.num_rows(), plc_table.num_columns()) + print(plc_shape) + assert plc_shape == pa_table.shape + + for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): + assert_column_eq(plc_col, pa_col) + + # Check column name equality + assert plc_table_w_meta.column_names == pa_table.column_names + + def cudf_raises(expected_exception: BaseException, *args, **kwargs): # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions match = kwargs.get("match", None) diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py new file mode 100644 index 00000000000..6aa888eab5a --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import io +import itertools + +import fastavro +import pyarrow as pa +import pytest +from utils import assert_table_and_meta_eq + +import cudf._lib.pylibcudf as plc + +avro_dtype_pairs = [ + ("boolean", pa.bool_()), + ("int", pa.int32()), + ("long", pa.int64()), + ("float", pa.float32()), + ("double", pa.float64()), + ("bytes", pa.string()), + ("string", pa.string()), +] + + +@pytest.fixture( + scope="module", + params=itertools.product(avro_dtype_pairs, avro_dtype_pairs), +) +def avro_dtypes(request): + return request.param + + +@pytest.fixture +def avro_dtype_data(avro_dtypes): + # avro_type1, _, avro_type2, _ = avro_dtypes + types1, types2 = avro_dtypes + avro_type1, _ = types1 + avro_type2, _ = types2 + + def _get_data(avro_type): + if avro_type == "boolean": + return [True, False, True] + elif avro_type in {"int", "long"}: + return [1, 2, -1] + elif avro_type in {"float", "double"}: + return [1.0, 3.1415, -3.1415] + elif avro_type == "bytes": + return [b"a", b"b", b"c"] + elif avro_type == "string": + return ["Hello", "World!", ""] + + return _get_data(avro_type1), _get_data(avro_type2) + + +@pytest.fixture( + params=[ + (0, 0), + (0, -1), + (1, -1), + (3, -1), + ] +) +def row_opts(request): + """ + (skip_rows, num_rows) combos for the avro reader + """ + return request.param + + +@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]]) +@pytest.mark.parametrize("nullable", [True, False]) +def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): + # avro_type1, expected_type1, avro_type2, expected_type2 = avro_dtypes + + types1, types2 = avro_dtypes + avro_type1, expected_type1 = types1 + avro_type2, expected_type2 = types2 + + avro_type1 = avro_type1 if not nullable else ["null", avro_type1] + avro_type2 = avro_type2 if not nullable else ["null", avro_type2] + + skip_rows, num_rows = row_opts + + schema = fastavro.parse_schema( + { + "type": "record", + "name": "test", + "fields": [ + {"name": "prop1", "type": 
avro_type1}, + {"name": "prop2", "type": avro_type2}, + ], + } + ) + + if nullable: + avro_dtype_data = ( + avro_dtype_data[0] + [None], + avro_dtype_data[1] + [None], + ) + + records = [ + {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data) + ] + + buffer = io.BytesIO() + fastavro.writer(buffer, schema, records) + buffer.seek(0) + + res = plc.io.avro.read_avro( + plc.io.types.SourceInfo([buffer]), + columns=columns, + skip_rows=skip_rows, + num_rows=num_rows, + ) + + expected = pa.Table.from_arrays( + [ + pa.array(avro_dtype_data[0], type=expected_type1), + pa.array(avro_dtype_data[1], type=expected_type2), + ], + names=["prop1", "prop2"], + ) + + # Adjust for skip_rows/num_rows in result + length = num_rows if num_rows != -1 else None + expected = expected.slice(skip_rows, length=length) + + # adjust for # of columns + if columns != []: + expected = expected.select(columns) + + assert_table_and_meta_eq(res, expected) From 3fdc38bcaf74814367ea9de3b99eef80f1c77d2a Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Fri, 31 May 2024 16:59:08 -0700 Subject: [PATCH 04/10] fix typo --- python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 2 +- python/cudf/cudf/pylibcudf_tests/common/utils.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 354d15e5667..87c74b73ca3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.table cimport Table cdef class TableWithMetadata: - cdef public Table table + cdef public Table tbl cdef table_metadata metadata @staticmethod diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 43efa522b5c..4aaa38f1325 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx 
@@ -30,7 +30,7 @@ cdef class TableWithMetadata: cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): """Create a Python TableWithMetadata from a libcudf table_with_metadata""" cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata) - out.table = Table.from_libcudf(move(tbl_with_meta.tbl)) + out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl)) out.metadata = tbl_with_meta.metadata return out diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 800e29aaff0..9334a4f1682 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -52,10 +52,9 @@ def assert_table_and_meta_eq( ) -> None: """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal""" - plc_table = plc_table_w_meta.table + plc_table = plc_table_w_meta.tbl plc_shape = (plc_table.num_rows(), plc_table.num_columns()) - print(plc_shape) assert plc_shape == pa_table.shape for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns): From 20e84db96cf27ec3d8cf8f9e89a30b241fd0765e Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Mon, 3 Jun 2024 19:33:14 +0000 Subject: [PATCH 05/10] update --- .../user_guide/api_docs/pylibcudf/index.rst | 10 +-- .../pylibcudf/{read_avro.rst => io/avro.rst} | 0 .../api_docs/pylibcudf/io/index.rst | 18 +++++ .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd | 1 + .../cudf/cudf/_lib/pylibcudf/io/__init__.py | 1 + python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 2 + python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 45 +++++++++--- .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 1 - .../cudf/pylibcudf_tests/test_source_info.py | 69 +++++++++++++++++++ 9 files changed, 128 insertions(+), 19 deletions(-) rename docs/cudf/source/user_guide/api_docs/pylibcudf/{read_avro.rst => io/avro.rst} (100%) create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst create mode 100644 
python/cudf/cudf/pylibcudf_tests/test_source_info.py diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index ca7f640fcf4..fcdeba2d2f6 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -16,6 +16,7 @@ This page provides API documentation for pylibcudf. filling gpumemoryview groupby + io/index.rst join lists merge @@ -30,12 +31,3 @@ This page provides API documentation for pylibcudf. table types unary - -I/O Functions -============= - -.. toctree:: - :maxdepth: 1 - :caption: I/O Functions - - avro diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/read_avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst similarity index 100% rename from docs/cudf/source/user_guide/api_docs/pylibcudf/read_avro.rst rename to docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst new file mode 100644 index 00000000000..0d53ac92db9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -0,0 +1,18 @@ +=== +I/O +=== + +I/O Utility Classes +=================== + +.. automodule:: cudf._lib.pylibcudf.io.types + :members: + + +I/O Functions +============= + +.. toctree:: + :maxdepth: 1 + + avro diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd index b704bbe4b04..250292746c1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd @@ -1,3 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . 
cimport avro, types +from .types cimport SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py index 6f47bb10aa5..5242c741911 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py @@ -1,3 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import avro, types +from .types import SourceInfo, TableWithMetadata diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 283e4023d4e..c4bab4c0896 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -21,6 +21,8 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a set of columns. + For details, see :cpp:class:`cudf::io::read_avro`. + Parameters ---------- source_info: SourceInfo diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 4aaa38f1325..527c4bd5ba0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -16,13 +16,24 @@ import os cdef class TableWithMetadata: + """A container holding a table and its associated metadata + (e.g. column names) + + For details, see :cpp:class:`cudf::io::table_with_metadata`. + """ @property def columns(self): + """ + Return a list containing the columns of the table + """ return self.tbl._columns @property def column_names(self): + """ + Return a list containing the column names of the table + """ # TODO: Handle nesting (columns with child columns) return [col_info.name.decode() for col_info in self.metadata.schema_info] @@ -35,24 +46,36 @@ cdef class TableWithMetadata: return out cdef class SourceInfo: + """A class containing details on a source to read from. + + For details, see :cpp:class:`cudf::io::source_info`. 
+ + Parameters + ---------- + sources : List[Union[str, bytes, io.BytesIO]] + A homogeneous list of sources (this can be a string filename, + bytes, or an io.BytesIO) to read from. + + Mixing different types of sources will raise a `ValueError`. + """ def __init__(self, list sources): if not sources: raise ValueError("Need to pass at least one source") - if isinstance(sources[0], os.PathLike) or isinstance(sources[0], str): - sources = [str(src) for src in sources] - cdef vector[string] c_files - if isinstance(sources[0], str): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in sources): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - sources) + if isinstance(sources[0], (os.PathLike, str)): c_files.reserve(len(sources)) + for src in sources: + if not isinstance(src, (os.PathLike, str)): + raise ValueError("All sources must be of the same type!") + if not os.path.isfile(src): + raise FileNotFoundError(errno.ENOENT, + os.strerror(errno.ENOENT), + src) + c_files.push_back( str(src).encode()) self.c_obj = move(source_info(c_files)) @@ -65,6 +88,8 @@ cdef class SourceInfo: if isinstance(sources[0], bytes): empty_buffer = True for buffer in sources: + if not isinstance(buffer, bytes): + raise ValueError("All sources must be of the same type!") if (len(buffer) > 0): c_buffer = buffer c_host_buffers.push_back(host_buffer(&c_buffer[0], @@ -72,6 +97,8 @@ cdef class SourceInfo: empty_buffer = False elif isinstance(sources[0], io.BytesIO): for bio in sources: + if not isinstance(bio, io.BytesIO): + raise ValueError("All sources must be of the same type!") c_buffer = bio.getbuffer() # check if empty? 
c_host_buffers.push_back(host_buffer(&c_buffer[0], c_buffer.shape[0])) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 27e4cf0ed64..8d87deb1472 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -15,7 +15,6 @@ cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.types cimport size_type -from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span cdef extern from "cudf/io/types.hpp" \ diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py new file mode 100644 index 00000000000..71a3ecbcc30 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py @@ -0,0 +1,69 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import io + +import pytest + +import cudf._lib.pylibcudf as plc + + +@pytest.mark.parametrize( + "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")] +) +def test_source_info_ctor(source, tmp_path): + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + source = str(file) + + plc.io.SourceInfo([source]) + + # TODO: test contents of source_info buffer is correct + # once buffers are exposed on python side + + +@pytest.mark.parametrize( + "sources", + [ + ["a.txt", "a.txt"], + [b"hello world", b"hello there"], + [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")], + ], +) +def test_source_info_ctor_multiple(sources, tmp_path): + for i in range(len(sources)): + source = sources[i] + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + sources[i] = str(file) + + plc.io.SourceInfo(sources) + + # TODO: test contents of source_info buffer is correct + # once buffers are exposed on python side + + +@pytest.mark.parametrize( + "sources", + [ + ["awef.txt", b"hello world", io.BytesIO(b"hello world")], + [b"hello world", b"hello there", "awef.txt"], + [ + io.BytesIO(b"hello world"), + io.BytesIO(b"hello there"), + b"hello world", + ], + ], +) +def test_source_info_ctor_mixing_invalid(sources, tmp_path): + # Create any file sources so that a missing file does not + # raise FileNotFoundError ahead of the mixed-type ValueError + for i in range(len(sources)): + source = sources[i] + if isinstance(source, str): + file = tmp_path / source + file.write_bytes("hello world".encode("utf-8")) + sources[i] = str(file) + with pytest.raises(ValueError): + plc.io.SourceInfo(sources) From 528ba2ca76eb6917e5281dabf8bbf4af6a8c690d Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 4 Jun 2024 15:01:51 +0000 Subject: [PATCH 06/10] fix docs --- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index c4bab4c0896..70d77715371 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -21,7 +21,7 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a set of columns. - For details, see :cpp:class:`cudf::io::read_avro`. + For details, see :cpp:func:`read_avro`. Parameters ---------- From 48d3e3f15ece4b663324054053897ffbb9c54643 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Tue, 4 Jun 2024 18:27:29 +0000 Subject: [PATCH 07/10] address comments --- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 5 +++-- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 10 +++++++--- python/cudf/cudf/_lib/utils.pyx | 6 +++--- python/cudf/cudf/pylibcudf_tests/test_avro.py | 14 +++----------- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 70d77715371..54396caa949 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -14,7 +14,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type cpdef TableWithMetadata read_avro( SourceInfo source_info, - list columns = [], # no-cython-lint + list columns = None, size_type skip_rows = 0, size_type num_rows = -1 ): @@ -27,7 +27,8 @@ cpdef TableWithMetadata read_avro( ---------- source_info: SourceInfo The SourceInfo object to read the avro dataset from. - columns: list, default [] + columns: list, default None + Optional columns to read, if not provided, reads all columns in the file. skip_rows: size_type, default 0 The number of rows to skip. 
num_rows: size_type, default -1 diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 527c4bd5ba0..2217febb13b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -27,15 +27,19 @@ cdef class TableWithMetadata: """ Return a list containing the columns of the table """ - return self.tbl._columns + return self.tbl.columns() @property def column_names(self): """ Return a list containing the column names of the table """ - # TODO: Handle nesting (columns with child columns) - return [col_info.name.decode() for col_info in self.metadata.schema_info] + cdef list names = [] + for col_info in self.metadata.schema_info: + # TODO: Handle nesting (columns with child columns) + assert col_info.children.size() == 0, "Child column names are not handled!" + names.append(col_info.name.decode()) + return names @staticmethod cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta): diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 5931a2758ab..de6b9f690b6 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -321,9 +321,9 @@ cdef data_from_pylibcudf_io(tbl_with_meta): into a dict of columns and an Index (cuDF format) """ return _data_from_columns( - columns = [Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names = tbl_with_meta.column_names, - index_names = None + columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], + column_names=tbl_with_meta.column_names, + index_names=None ) cdef columns_from_table_view( diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py index 6aa888eab5a..d6cd86768cd 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_avro.py +++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py @@ -22,8 +22,7 @@ @pytest.fixture( - scope="module", - 
params=itertools.product(avro_dtype_pairs, avro_dtype_pairs), + scope="module", params=itertools.combinations(avro_dtype_pairs, 2) ) def avro_dtypes(request): return request.param @@ -31,10 +30,7 @@ def avro_dtypes(request): @pytest.fixture def avro_dtype_data(avro_dtypes): - # avro_type1, _, avro_type2, _ = avro_dtypes - types1, types2 = avro_dtypes - avro_type1, _ = types1 - avro_type2, _ = types2 + (avro_type1, _), (avro_type2, _) = avro_dtypes def _get_data(avro_type): if avro_type == "boolean": @@ -69,11 +65,7 @@ def row_opts(request): @pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]]) @pytest.mark.parametrize("nullable", [True, False]) def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable): - # avro_type1, expected_type1, avro_type2, expected_type2 = avro_dtypes - - types1, types2 = avro_dtypes - avro_type1, expected_type1 = types1 - avro_type2, expected_type2 = types2 + (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes avro_type1 = avro_type1 if not nullable else ["null", avro_type1] avro_type2 = avro_type2 if not nullable else ["null", avro_type2] From 3dfba7cdf201be01055845d49cf2cda0d5c5799f Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 Jun 2024 00:16:03 +0000 Subject: [PATCH 08/10] clean imports --- python/cudf/cudf/_lib/pylibcudf/io/avro.pxd | 3 --- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 2 +- python/cudf/cudf/_lib/pylibcudf/io/types.pxd | 3 --- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd index 00d12264029..3695f36a6e7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd @@ -1,7 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
- -from libcpp.memory cimport unique_ptr - from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options from cudf._lib.pylibcudf.libcudf.types cimport size_type diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 54396caa949..89d55d161bd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -21,7 +21,7 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a set of columns. - For details, see :cpp:func:`read_avro`. + For details, see :cpp:func:`cudf::io::read_avro`. Parameters ---------- diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd index 87c74b73ca3..aa846a47343 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd @@ -1,7 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr - from cudf._lib.pylibcudf.libcudf.io.types cimport ( column_encoding, column_in_metadata, From bf081fca0a079526110ad51e603c9cb543b50b32 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 Jun 2024 18:37:21 +0000 Subject: [PATCH 09/10] fix docs --- docs/cudf/source/libcudf_docs/api_docs/io_readers.rst | 2 +- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 2 +- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst index a835673dee4..f94a5ddb403 100644 --- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst +++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst @@ -2,4 +2,4 @@ Io Readers ========== .. 
doxygengroup:: io_readers - :desc-only: + :members: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 89d55d161bd..54396caa949 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -21,7 +21,7 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a set of columns. - For details, see :cpp:func:`cudf::io::read_avro`. + For details, see :cpp:func:`read_avro`. Parameters ---------- diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index 2217febb13b..cd777232b33 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -56,9 +56,9 @@ cdef class SourceInfo: Parameters ---------- - sources : List[Union[str, bytes, io.BytesIO]] + sources : List[Union[str, os.PathLike, bytes, io.BytesIO]] A homogeneous list of sources (this can be a string filename, - bytes, or an io.BytesIO) to read from. + an os.PathLike, bytes, or an io.BytesIO) to read from. Mixing different types of sources will raise a `ValueError`. """ From 1ec568df31746f4b822632b4df5d60cc3cc5b103 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 5 Jun 2024 22:23:51 +0000 Subject: [PATCH 10/10] remove the reference --- docs/cudf/source/libcudf_docs/api_docs/io_readers.rst | 2 +- python/cudf/cudf/_lib/pylibcudf/io/avro.pyx | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst index f94a5ddb403..a835673dee4 100644 --- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst +++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst @@ -2,4 +2,4 @@ Io Readers ========== .. 
doxygengroup:: io_readers - :members: + :desc-only: diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx index 54396caa949..946e0896fc8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx @@ -21,8 +21,6 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a set of columns. - For details, see :cpp:func:`read_avro`. - Parameters ---------- source_info: SourceInfo