From b11fc0ac07cc67e4db6a9d2e203dda0340a1ae5b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Sep 2023 18:26:16 -0700 Subject: [PATCH 1/9] Add pylibcudf scalar and interop functionality and use in cudf --- python/cudf/cudf/_lib/datetime.pyx | 6 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 27 +++- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 8 +- python/cudf/cudf/_lib/pylibcudf/__init__.py | 5 +- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 26 ++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 98 +++++++++++++++ python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 32 +++++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 119 ++++++++++++++++++ python/cudf/cudf/_lib/scalar.pxd | 10 +- python/cudf/cudf/_lib/scalar.pyx | 80 ++++++++---- 10 files changed, 373 insertions(+), 38 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pyx diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 81949dbaa20..3d96f59c4d6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) + cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - start.c_value.get()[0], + c_start[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0ce42dc43ff..64adb38ace3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,10 +12,35 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx + types.pyx utils.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar) +foreach(target IN LISTS targets_using_arrow_headers) + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() + +# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for +# more info +find_package(NumPy REQUIRED) +set(targets_using_numpy pylibcudf_interop pylibcudf_scalar) +foreach(target IN LISTS targets_using_numpy) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. + # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index ba7822b0a54..cf0007b9303 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,9 +1,13 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying +# TODO: Cannot import interop because it introduces a build-time pyarrow header +# dependency for everything that cimports pylibcudf. See if there's a way to +# avoid that before polluting the whole package. +from . cimport copying # , interop from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -12,7 +16,9 @@ from .types cimport DataType __all__ = [ "Column", "DataType", + "Scalar", "Table", "copying", "gpumemoryview", + # "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 3edff9a53e8..72b74a57b87 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,16 +1,19 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying +from . import copying, interop from .column import Column from .gpumemoryview import gpumemoryview +from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", + "Scalar", "Table", "TypeId", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd new file mode 100644 index 00000000000..c1268b8ca1a --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table + +from cudf._lib.cpp.interop cimport column_metadata + +from .scalar cimport Scalar +from .table cimport Table + + +cdef class ColumnMetadata: + cdef public object name + cdef public object children_meta + cdef column_metadata to_c_metadata(self) + +cpdef Table from_arrow( + pa_Table pyarrow_table, +) + +cpdef Scalar from_arrow_scalar( + pa_Scalar pyarrow_scalar, +) + +cpdef pa_Table to_arrow(Table tbl, list metadata) + +cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx new file mode 100644 index 00000000000..9e6b44b117d --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -0,0 +1,98 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pyarrow.lib cimport ( + CScalar as pa_CScalar, + CTable as pa_CTable, + Scalar as pa_Scalar, + Table as pa_Table, + pyarrow_unwrap_scalar, + pyarrow_unwrap_table, + pyarrow_wrap_scalar, + pyarrow_wrap_table, +) + +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) +from cudf._lib.cpp.scalar.scalar cimport scalar +from cudf._lib.cpp.table.table cimport table + +from .scalar cimport Scalar +from .table cimport Table + + +cdef class ColumnMetadata: + def __init__(self, name): + self.name = name + self.children_meta = [] + + cdef column_metadata to_c_metadata(self): + """Convert to C++ column_metadata. + + Since this class is mutable and cheap, it is easier to create the C++ + object on the fly rather than have it directly backing the storage for + the Cython class. + """ + cdef column_metadata c_metadata + cdef ColumnMetadata child_meta + c_metadata.name = self.name.encode() + for child_meta in self.children_meta: + c_metadata.children_meta.push_back(child_meta.to_c_metadata()) + return c_metadata + + +cpdef Table from_arrow( + pa_Table pyarrow_table, +): + cdef shared_ptr[pa_CTable] ctable = ( + pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) + + +cpdef Scalar from_arrow_scalar( + pa_Scalar pyarrow_scalar, +): + cdef shared_ptr[pa_CScalar] cscalar = ( + pyarrow_unwrap_scalar(pyarrow_scalar) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cscalar.get()[0])) + + return Scalar.from_libcudf(move(c_result)) + + +cpdef pa_Table to_arrow(Table tbl, list metadata): + cdef shared_ptr[pa_CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_c_metadata()) + + with nogil: + c_result = move(cpp_to_arrow(tbl.view(), c_metadata)) + + return pyarrow_wrap_table(c_result) + + +cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata): + cdef shared_ptr[pa_CScalar] c_result + cdef column_metadata c_metadata = metadata.to_c_metadata() + + with nogil: + c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata)) + + return pyarrow_wrap_scalar(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd new file mode 100644 index 00000000000..d20c65f0be0 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr + +from rmm._lib.memory_resource cimport DeviceMemoryResource + +from cudf._lib.cpp.scalar.scalar cimport scalar + +from .types cimport DataType + + +cdef class Scalar: + cdef unique_ptr[scalar] c_obj + cdef DataType _data_type + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + cdef const scalar* get(self) except * + + cpdef DataType type(self) + cpdef bool is_valid(self) + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) + + # TODO: Make sure I'm correct to avoid typing the metadata as + # ColumnMetadata, I assume that will cause circular cimport problems + cpdef to_pyarrow_scalar(self, metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx new file mode 100644 index 00000000000..d06bb9adeb9 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -0,0 +1,119 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +cimport pyarrow.lib +from cython cimport no_gc_clear +from libcpp.memory cimport unique_ptr + +import pyarrow.lib + +from rmm._lib.memory_resource cimport get_current_device_resource + +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) + +from .types cimport DataType, type_id + + +# The DeviceMemoryResource attribute could be released prematurely +# by the gc if the DeviceScalar is in a reference cycle. Removing +# the tp_clear function with the no_gc_clear decoration prevents that. +# See https://github.com/rapidsai/rmm/pull/931 for details. +@no_gc_clear +cdef class Scalar: + """A scalar value in device memory.""" + # Unlike for columns, libcudf does not support scalar views. All APIs that + # accept scalar values accept references to the owning object rather than a + # special view type. As a result, pylibcudf.Scalar has a simpler structure + # than pylibcudf.Column because it can be a true wrapper around a libcudf + # column + + def __cinit__(self, *args, **kwargs): + self.mr = get_current_device_resource() + + def __init__(self, pyarrow.lib.Scalar value=None): + # TODO: This case is not something we really want to + # support, but it here for now to ease the transition of + # DeviceScalar. + if value is not None: + raise ValueError("Scalar should be constructed with a factory") + + @staticmethod + def from_pyarrow_scalar(pyarrow.lib.Scalar value, DataType data_type=None): + # Allow passing a dtype, but only for the purpose of decimals for now + + # Need a local import here to avoid a circular dependency because + # from_arrow_scalar returns a Scalar. + from .interop import from_arrow_scalar + + cdef Scalar s = from_arrow_scalar(value) + if s.type().id() != type_id.DECIMAL128: + if data_type is not None: + raise ValueError( + "dtype may not be passed for non-decimal types" + ) + return s + + if data_type is None: + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + cdef type_id tid = data_type.id() + if tid not in (type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128): + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + + if tid == type_id.DECIMAL128: + return s + + if tid == type_id.DECIMAL32: + s.c_obj.reset( + new fixed_point_scalar[decimal32]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid == type_id.DECIMAL64: + s.c_obj.reset( + new fixed_point_scalar[decimal64]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + return s + + cpdef to_pyarrow_scalar(self, metadata): + from .interop import to_arrow_scalar + return to_arrow_scalar(self, metadata) + + cdef const scalar* get(self) except *: + return self.c_obj.get() + + cpdef DataType type(self): + """The type of data in the column.""" + return self._data_type + + cpdef bool is_valid(self): + """True if the scalar is valid, false if not""" + return self.get().is_valid() + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): + """Construct a Scalar object from a libcudf scalar. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + cdef Scalar s = Scalar.__new__(Scalar) + s.c_obj.swap(libcudf_scalar) + s._data_type = DataType.from_libcudf(s.get().type()) + return s diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 1deed60d67d..ae1d350edc6 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,20 +1,16 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef unique_ptr[scalar] c_value - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr + cdef pylibcudf.Scalar c_value cdef object _dtype diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5ab286c5701..4dd217b6515 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -cimport cython - import copy import numpy as np @@ -13,17 +11,17 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._lib.memory_resource cimport get_current_device_resource - import cudf +from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + is_list_dtype, + is_struct_dtype, +) from cudf.core.missing import NA, NaT -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - -from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar - cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, @@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id def _replace_nested(obj, check, replacement): @@ -61,15 +60,32 @@ def _replace_nested(obj, check, replacement): _replace_nested(v, check, replacement) -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear +def gather_metadata(dtypes): + # dtypes is a dict mapping names to column dtypes + # This interface is a bit clunky, but it matches libcudf. May want to + # consider better approaches to building up the metadata eventually. + out = [] + for name, dtype in dtypes.items(): + v = pylibcudf.interop.ColumnMetadata(name) + if is_struct_dtype(dtype): + v.children_meta = gather_metadata(dtype.fields) + elif is_list_dtype(dtype): + # Offsets column is unnamed and has no children + v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.extend( + gather_metadata({"": dtype.element_type}) + ) + out.append(v) + return out + + cdef class DeviceScalar: + # TODO: I think this should be removable, except that currently the way + # that from_unique_ptr is implemented is probably dereferencing this in an + # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() + self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): """ @@ -85,7 +101,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') if cudf.utils.utils.is_na_like(value): value = None @@ -108,10 +124,17 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa_type) - # Note: This factory-like behavior in __init__ will be removed when - # migrating to pylibcudf. - cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) - self.c_value.swap(obj.c_value) + data_type = None + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + tid = pylibcudf.TypeId.DECIMAL128 + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + data_type = pylibcudf.DataType(tid, -dtype.scale) + + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar, data_type) + self._dtype = dtype def _to_host_scalar(self): is_datetime = self.dtype.kind == "M" @@ -119,7 +142,8 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA - ps = to_arrow_scalar(self) + metadata = gather_metadata({"": self.dtype})[0] + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return null_type @@ -142,6 +166,10 @@ cdef class DeviceScalar: _replace_nested(ret, lambda item: item is None, NA) return ret + # TODO: This is just here for testing and should be removed. + def get(self): + return self.c_value + @property def dtype(self): """ @@ -158,13 +186,13 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.get() + return self.c_value.c_obj.get() cpdef bool is_valid(self): """ Returns if the Scalar is valid or not(i.e., ). """ - return self.get_raw_ptr()[0].is_valid() + return self.c_value.is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -183,7 +211,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = move(ptr) + s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -310,9 +338,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') From 33de8d448ee2a48c80fe907c5b6609f0edd158a3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Sep 2023 09:19:34 -0700 Subject: [PATCH 2/9] Some cleanup --- python/cudf/cudf/_lib/scalar.pxd | 3 +++ python/cudf/cudf/_lib/scalar.pyx | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index ae1d350edc6..77733f59c3d 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -5,6 +5,9 @@ from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +# TODO: Would like to remove this cimport, but it will require some more work +# to excise all C code in scalar.pyx that relies on using the C API of the +# pylibcudf Scalar underlying the DeviceScalar. from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 4dd217b6515..2717c361b98 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -166,10 +166,6 @@ cdef class DeviceScalar: _replace_nested(ret, lambda item: item is None, NA) return ret - # TODO: This is just here for testing and should be removed. - def get(self): - return self.c_value - @property def dtype(self): """ From 840200bf8509bca59f8b2b32dffb2c5cca5ecb72 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Sep 2023 12:00:38 -0700 Subject: [PATCH 3/9] Add comment --- python/cudf/cudf/_lib/scalar.pyx | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 2717c361b98..c54db8793e9 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -338,5 +338,17 @@ def _create_proxy_nat_scalar(dtype): elif dtype.type == np.timedelta64: _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result + + # TODO: It should be able to reimplement the above with pylibcudf. + # Currently this doesn't quite seem to work, though, apparently because + # we need a way to create NaT _valid_ scalars but ingesting from + # pyarrow automatically sets them to invalid. Merely setting to valid + # after the fact is insufficient because the underlying memory appears + # to not be initialized. + # nat = pa.scalar(dtype.type('NaT').astype(dtype)) + # result.c_value = pylibcudf.Scalar.from_pyarrow_scalar(nat) + # result.c_value.c_obj.get().set_valid_async(True) + # result._dtype = dtype + # return result else: raise TypeError('NAT only valid for datetime and timedelta') From e609f601a5f28906b5f7d4f401b4ff5e4debcbf2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Sep 2023 12:02:45 -0700 Subject: [PATCH 4/9] Remove now unnecessary Cython functions --- python/cudf/cudf/_lib/interop.pyx | 95 +------------------------------ 1 file changed, 1 insertion(+), 94 deletions(-) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 639754fc54f..8fd2a409d90 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,14 +4,7 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar, - CTable, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - pyarrow_wrap_scalar, - pyarrow_wrap_table, -) +from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport type_id -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -199,79 +182,3 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def to_arrow_scalar(DeviceScalar source_scalar): - """Convert a scalar to a PyArrow scalar. - - Parameters - ---------- - source_scalar : the scalar to convert - - Returns - ------- - pyarrow.lib.Scalar - """ - cdef vector[column_metadata] cpp_metadata = gather_metadata( - [("", source_scalar.dtype)] - ) - cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() - - cdef shared_ptr[CScalar] cpp_arrow_scalar - with nogil: - cpp_arrow_scalar = cpp_to_arrow( - source_scalar_ptr[0], cpp_metadata[0] - ) - - return pyarrow_wrap_scalar(cpp_arrow_scalar) - - -@acquire_spill_lock() -def from_arrow_scalar(object input_scalar, output_dtype=None): - """Convert from PyArrow scalar to a cudf scalar. - - Parameters - ---------- - input_scalar : PyArrow scalar - output_dtype : output type to cast to, ignored except for decimals - - Returns - ------- - cudf._lib.DeviceScalar - """ - cdef shared_ptr[CScalar] cpp_arrow_scalar = ( - pyarrow_unwrap_scalar(input_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) - - cdef type_id ctype = c_result.get().type().id() - if ctype == type_id.DECIMAL128: - if output_dtype is None: - # Decimals must be cast to the cudf dtype of the right width - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - if isinstance(output_dtype, Decimal32Dtype): - c_result.reset( - new fixed_point_scalar[decimal32]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - elif isinstance(output_dtype, Decimal64Dtype): - c_result.reset( - new fixed_point_scalar[decimal64]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - # Decimal128Dtype is a no-op, no conversion needed. - - return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) From 7bb36c4fb1c6c9be6ac103133c91b8bebbdffdf1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Sep 2023 14:38:55 -0700 Subject: [PATCH 5/9] Revert "Add comment" This reverts commit bdfcd244ef806abb1bbf6d4c5d80e246554f0d2c. --- python/cudf/cudf/_lib/scalar.pyx | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index c54db8793e9..2717c361b98 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -338,17 +338,5 @@ def _create_proxy_nat_scalar(dtype): elif dtype.type == np.timedelta64: _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result - - # TODO: It should be able to reimplement the above with pylibcudf. - # Currently this doesn't quite seem to work, though, apparently because - # we need a way to create NaT _valid_ scalars but ingesting from - # pyarrow automatically sets them to invalid. Merely setting to valid - # after the fact is insufficient because the underlying memory appears - # to not be initialized. - # nat = pa.scalar(dtype.type('NaT').astype(dtype)) - # result.c_value = pylibcudf.Scalar.from_pyarrow_scalar(nat) - # result.c_value.c_obj.get().set_valid_async(True) - # result._dtype = dtype - # return result else: raise TypeError('NAT only valid for datetime and timedelta') From d0196a00147a0bb62299e9f0ad304fc4ee4ebd9c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 27 Sep 2023 16:20:49 -0700 Subject: [PATCH 6/9] Address most PR comments --- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 12 +++--- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 43 ++++++++------------- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 2 +- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 20 +++++----- python/cudf/cudf/_lib/scalar.pyx | 20 ++++++++-- 5 files changed, 49 insertions(+), 48 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd index c1268b8ca1a..e9c8d44dd2d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -1,6 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table +from pyarrow cimport lib as pa from cudf._lib.cpp.interop cimport column_metadata @@ -11,16 +11,16 @@ from .table cimport Table cdef class ColumnMetadata: cdef public object name cdef public object children_meta - cdef column_metadata to_c_metadata(self) + cdef column_metadata to_libcudf(self) cpdef Table from_arrow( - pa_Table pyarrow_table, + pa.Table pyarrow_table, ) cpdef Scalar from_arrow_scalar( - pa_Scalar pyarrow_scalar, + pa.Scalar pyarrow_scalar, ) -cpdef pa_Table to_arrow(Table tbl, list metadata) +cpdef pa.Table to_arrow(Table tbl, list metadata) -cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata) +cpdef pa.Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 9e6b44b117d..c0897dca991 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -4,16 +4,7 @@ from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar as pa_CScalar, - CTable as pa_CTable, - Scalar as pa_Scalar, - Table as pa_Table, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - pyarrow_wrap_scalar, - pyarrow_wrap_table, -) +from pyarrow cimport lib as pa from cudf._lib.cpp.interop cimport ( column_metadata, @@ -32,7 +23,7 @@ cdef class ColumnMetadata: self.name = name self.children_meta = [] - cdef column_metadata to_c_metadata(self): + cdef column_metadata to_libcudf(self): """Convert to C++ column_metadata. Since this class is mutable and cheap, it is easier to create the C++ @@ -43,15 +34,15 @@ cdef class ColumnMetadata: cdef ColumnMetadata child_meta c_metadata.name = self.name.encode() for child_meta in self.children_meta: - c_metadata.children_meta.push_back(child_meta.to_c_metadata()) + c_metadata.children_meta.push_back(child_meta.to_libcudf()) return c_metadata cpdef Table from_arrow( - pa_Table pyarrow_table, + pa.Table pyarrow_table, ): - cdef shared_ptr[pa_CTable] ctable = ( - pyarrow_unwrap_table(pyarrow_table) + cdef shared_ptr[pa.CTable] ctable = ( + pa.pyarrow_unwrap_table(pyarrow_table) ) cdef unique_ptr[table] c_result @@ -62,10 +53,10 @@ cpdef Table from_arrow( cpdef Scalar from_arrow_scalar( - pa_Scalar pyarrow_scalar, + pa.Scalar pyarrow_scalar, ): - cdef shared_ptr[pa_CScalar] cscalar = ( - pyarrow_unwrap_scalar(pyarrow_scalar) + cdef shared_ptr[pa.CScalar] cscalar = ( + pa.pyarrow_unwrap_scalar(pyarrow_scalar) ) cdef unique_ptr[scalar] c_result @@ -75,24 +66,24 @@ cpdef Scalar from_arrow_scalar( return Scalar.from_libcudf(move(c_result)) -cpdef pa_Table to_arrow(Table tbl, list metadata): - cdef shared_ptr[pa_CTable] c_result +cpdef pa.Table to_arrow(Table tbl, list metadata): + cdef shared_ptr[pa.CTable] c_result cdef vector[column_metadata] c_metadata cdef ColumnMetadata meta for meta in metadata: - c_metadata.push_back(meta.to_c_metadata()) + c_metadata.push_back(meta.to_libcudf()) with nogil: c_result = move(cpp_to_arrow(tbl.view(), c_metadata)) - return pyarrow_wrap_table(c_result) + return pa.pyarrow_wrap_table(c_result) -cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata): - cdef shared_ptr[pa_CScalar] c_result - cdef column_metadata c_metadata = metadata.to_c_metadata() +cpdef pa.Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata): + cdef shared_ptr[pa.CScalar] c_result + cdef column_metadata c_metadata = metadata.to_libcudf() with nogil: c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata)) - return pyarrow_wrap_scalar(c_result) + return pa.pyarrow_wrap_scalar(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index d20c65f0be0..389ede25cde 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -29,4 +29,4 @@ cdef class Scalar: # TODO: Make sure I'm correct to avoid typing the metadata as # ColumnMetadata, I assume that will cause circular cimport problems - cpdef to_pyarrow_scalar(self, metadata) + cpdef to_arrow(self, metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index d06bb9adeb9..71efea54405 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -20,9 +20,9 @@ from .types cimport DataType, type_id # The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. +# by the gc if the Scalar is in a reference cycle. Removing the tp_clear +# function with the no_gc_clear decoration prevents that. See +# https://github.com/rapidsai/rmm/pull/931 for details. @no_gc_clear cdef class Scalar: """A scalar value in device memory.""" @@ -64,13 +64,6 @@ cdef class Scalar: ) cdef type_id tid = data_type.id() - if tid not in (type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128): - raise ValueError( - "Decimal scalars may only be cast to decimals" - ) - - if tid == type_id.DECIMAL128: - return s if tid == type_id.DECIMAL32: s.c_obj.reset( @@ -88,9 +81,14 @@ cdef class Scalar: s.c_obj.get().is_valid() ) ) + elif tid != type_id.DECIMAL128: + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + return s - cpdef to_pyarrow_scalar(self, metadata): + cpdef to_arrow(self, metadata): from .interop import to_arrow_scalar return to_arrow_scalar(self, metadata) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 2717c361b98..7c09f40d67e 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -61,9 +61,21 @@ def _replace_nested(obj, check, replacement): def gather_metadata(dtypes): - # dtypes is a dict mapping names to column dtypes - # This interface is a bit clunky, but it matches libcudf. May want to - # consider better approaches to building up the metadata eventually. + """Convert a dict of dtypes to a list of ColumnMetadata objects. + + The metadata is constructed recursively so that nested types are + represented as nested ColumnMetadata objects. + + Parameters + ---------- + dtypes : dict + A dict mapping column names to dtypes. + + Returns + ------- + List[ColumnMetadata] + A list of ColumnMetadata objects. + """ out = [] for name, dtype in dtypes.items(): v = pylibcudf.interop.ColumnMetadata(name) @@ -143,7 +155,7 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA metadata = gather_metadata({"": self.dtype})[0] - ps = self.c_value.to_pyarrow_scalar(metadata) + ps = self.c_value.to_arrow(metadata) if not ps.is_valid: return null_type From aee6a046e31f775277015e48f9954218bdc97d64 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 27 Sep 2023 16:40:08 -0700 Subject: [PATCH 7/9] Move from/to arrow impls directly into Scalar. --- python/cudf/cudf/_lib/CMakeLists.txt | 8 +++- python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 8 ++++ python/cudf/cudf/_lib/pylibcudf/interop.pxd | 6 --- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 27 ------------ python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 6 +-- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 42 +++++++++++++------ python/cudf/cudf/_lib/scalar.pyx | 2 +- python/cudf/cudf/_lib/strings/CMakeLists.txt | 10 ++++- .../cudf/_lib/strings/convert/CMakeLists.txt | 10 ++++- .../cudf/_lib/strings/split/CMakeLists.txt | 10 ++++- 10 files changed, 74 insertions(+), 55 deletions(-) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 947659c290a..1b543b94589 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -107,8 +107,12 @@ if(${PYARROW_RESULT}) message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") endif() -set(targets_using_arrow_headers interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_arrow_headers) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 515b9c1d6e4..d4e2392ee04 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd index e9c8d44dd2d..f7bc33d5d96 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -17,10 +17,4 @@ cpdef Table from_arrow( pa.Table pyarrow_table, ) -cpdef Scalar from_arrow_scalar( - pa.Scalar pyarrow_scalar, -) - cpdef pa.Table to_arrow(Table tbl, list metadata) - -cpdef pa.Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index c0897dca991..f2faeeb9427 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,6 +1,5 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -11,10 +10,8 @@ from cudf._lib.cpp.interop cimport ( from_arrow as cpp_from_arrow, to_arrow as cpp_to_arrow, ) -from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table -from .scalar cimport Scalar from .table cimport Table @@ -52,20 +49,6 @@ cpdef Table from_arrow( return Table.from_libcudf(move(c_result)) -cpdef Scalar from_arrow_scalar( - pa.Scalar pyarrow_scalar, -): - cdef shared_ptr[pa.CScalar] cscalar = ( - pa.pyarrow_unwrap_scalar(pyarrow_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cscalar.get()[0])) - - return Scalar.from_libcudf(move(c_result)) - - cpdef pa.Table to_arrow(Table tbl, list metadata): cdef shared_ptr[pa.CTable] c_result cdef vector[column_metadata] c_metadata @@ -77,13 +60,3 @@ cpdef pa.Table to_arrow(Table tbl, list metadata): c_result = move(cpp_to_arrow(tbl.view(), c_metadata)) return pa.pyarrow_wrap_table(c_result) - - -cpdef pa.Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata): - cdef shared_ptr[pa.CScalar] c_result - cdef column_metadata c_metadata = metadata.to_libcudf() - - with nogil: - c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata)) - - return pa.pyarrow_wrap_scalar(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 389ede25cde..09d853d832f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -2,11 +2,13 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa from rmm._lib.memory_resource cimport DeviceMemoryResource from cudf._lib.cpp.scalar.scalar cimport scalar +from .interop cimport ColumnMetadata from .types cimport DataType @@ -27,6 +29,4 @@ cdef class Scalar: @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) - # TODO: Make sure I'm correct to avoid typing the metadata as - # ColumnMetadata, I assume that will cause circular cimport problems - cpdef to_arrow(self, metadata) + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 71efea54405..04f588bd3e6 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -1,13 +1,18 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -cimport pyarrow.lib from cython cimport no_gc_clear -from libcpp.memory cimport unique_ptr - -import pyarrow.lib +from cython.operator cimport dereference +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from pyarrow cimport lib as pa from rmm._lib.memory_resource cimport get_current_device_resource +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.wrappers.decimals cimport ( decimal32, @@ -16,6 +21,7 @@ from cudf._lib.cpp.wrappers.decimals cimport ( scale_type, ) +from .interop cimport ColumnMetadata from .types cimport DataType, type_id @@ -35,7 +41,7 @@ cdef class Scalar: def __cinit__(self, *args, **kwargs): self.mr = get_current_device_resource() - def __init__(self, pyarrow.lib.Scalar value=None): + def __init__(self, pa.Scalar value=None): # TODO: This case is not something we really want to # support, but it here for now to ease the transition of # DeviceScalar. @@ -43,14 +49,19 @@ cdef class Scalar: raise ValueError("Scalar should be constructed with a factory") @staticmethod - def from_pyarrow_scalar(pyarrow.lib.Scalar value, DataType data_type=None): + def from_arrow(pa.Scalar value, DataType data_type=None): # Allow passing a dtype, but only for the purpose of decimals for now - # Need a local import here to avoid a circular dependency because - # from_arrow_scalar returns a Scalar. - from .interop import from_arrow_scalar + cdef shared_ptr[pa.CScalar] cscalar = ( + pa.pyarrow_unwrap_scalar(value) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cscalar.get()[0])) + + cdef Scalar s = Scalar.from_libcudf(move(c_result)) - cdef Scalar s = from_arrow_scalar(value) if s.type().id() != type_id.DECIMAL128: if data_type is not None: raise ValueError( @@ -88,9 +99,14 @@ cdef class Scalar: return s - cpdef to_arrow(self, metadata): - from .interop import to_arrow_scalar - return to_arrow_scalar(self, metadata) + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata): + cdef shared_ptr[pa.CScalar] c_result + cdef column_metadata c_metadata = metadata.to_libcudf() + + with nogil: + c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata)) + + return pa.pyarrow_wrap_scalar(c_result) cdef const scalar* get(self) except *: return self.c_obj.get() diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 7c09f40d67e..0b64c75f7b6 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -145,7 +145,7 @@ cdef class DeviceScalar: tid = pylibcudf.TypeId.DECIMAL64 data_type = pylibcudf.DataType(tid, -dtype.scale) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar, data_type) + self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type) self._dtype = dtype def _to_host_scalar(self): diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index a5e87a456cb..fc11f047ab4 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -40,6 +40,14 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index 434f79d3b5f..f55bb1fb780 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 59a22c06e85..2f2063482af 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,3 +20,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() From a0aec16614f4a2c98016b05c42486730b469b111 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 27 Sep 2023 16:55:09 -0700 Subject: [PATCH 8/9] Also implement from/to_arrow for Table --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 11 ----- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 41 +------------------ python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 ++ python/cudf/cudf/_lib/pylibcudf/table.pyx | 33 ++++++++++++++- 5 files changed, 38 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 64adb38ace3..76ac2835628 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -30,7 +30,7 @@ execute_process( OUTPUT_STRIP_TRAILING_WHITESPACE ) -set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar) +set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar pylibcudf_table) foreach(target IN LISTS targets_using_arrow_headers) target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() @@ -38,7 +38,7 @@ endforeach() # TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for # more info find_package(NumPy REQUIRED) -set(targets_using_numpy pylibcudf_interop pylibcudf_scalar) +set(targets_using_numpy ${targets_using_arrow_headers} pylibcudf_types) foreach(target IN LISTS targets_using_numpy) target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd index f7bc33d5d96..3a79e5425d4 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -1,20 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from pyarrow cimport lib as pa - from cudf._lib.cpp.interop cimport column_metadata -from .scalar cimport Scalar -from .table cimport Table - cdef class ColumnMetadata: cdef public object name cdef public object children_meta cdef column_metadata to_libcudf(self) - -cpdef Table from_arrow( - pa.Table pyarrow_table, -) - -cpdef pa.Table to_arrow(Table tbl, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index f2faeeb9427..0cdca275027 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,18 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from libcpp.memory cimport shared_ptr, unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector -from pyarrow cimport lib as pa - -from cudf._lib.cpp.interop cimport ( - column_metadata, - from_arrow as cpp_from_arrow, - to_arrow as cpp_to_arrow, -) -from cudf._lib.cpp.table.table cimport table - -from .table cimport Table +from cudf._lib.cpp.interop cimport column_metadata cdef class ColumnMetadata: @@ -33,30 +21,3 @@ cdef class ColumnMetadata: for child_meta in self.children_meta: c_metadata.children_meta.push_back(child_meta.to_libcudf()) return c_metadata - - -cpdef Table from_arrow( - pa.Table pyarrow_table, -): - cdef shared_ptr[pa.CTable] ctable = ( - pa.pyarrow_unwrap_table(pyarrow_table) - ) - cdef unique_ptr[table] c_result - - with nogil: - c_result = move(cpp_from_arrow(ctable.get()[0])) - - return Table.from_libcudf(move(c_result)) - - -cpdef pa.Table to_arrow(Table tbl, list metadata): - cdef shared_ptr[pa.CTable] c_result - cdef vector[column_metadata] c_metadata - cdef ColumnMetadata meta - for meta in metadata: - c_metadata.push_back(meta.to_libcudf()) - - with nogil: - c_result = move(cpp_to_arrow(tbl.view(), c_metadata)) - - return pa.pyarrow_wrap_table(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 95f197b13eb..a9e2874232a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -16,3 +17,5 @@ cdef class Table: cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) cpdef list columns(self) + + cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 720f9815bd6..c41eb82e4a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -1,15 +1,22 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pyarrow cimport lib as pa from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .interop cimport ColumnMetadata cdef class Table: @@ -60,3 +67,27 @@ cdef class Table: cpdef list columns(self): return self._columns + + @staticmethod + def from_arrow(pa.Table pyarrow_table): + cdef shared_ptr[pa.CTable] ctable = ( + pa.pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) + + cpdef pa.Table to_arrow(self, list metadata): + cdef shared_ptr[pa.CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_libcudf()) + + with nogil: + c_result = move(cpp_to_arrow(self.view(), c_metadata)) + + return pa.pyarrow_wrap_table(c_result) From b847d6520cfc4c306ee6d5f16df660d1fd3e5e05 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 27 Sep 2023 17:02:38 -0700 Subject: [PATCH 9/9] Expose interop in top-level Cython package --- python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 6 ++---- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 7 ++----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 76ac2835628..5185b2d4bb5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -30,16 +30,14 @@ execute_process( OUTPUT_STRIP_TRAILING_WHITESPACE ) -set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar pylibcudf_table) -foreach(target IN LISTS targets_using_arrow_headers) +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() # TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for # more info find_package(NumPy REQUIRED) -set(targets_using_numpy ${targets_using_arrow_headers} pylibcudf_types) -foreach(target IN LISTS targets_using_numpy) +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index cf0007b9303..7a35854392c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,10 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -# TODO: Cannot import interop because it introduces a build-time pyarrow header -# dependency for everything that cimports pylibcudf. See if there's a way to -# avoid that before polluting the whole package. -from . cimport copying # , interop +from . cimport copying, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -20,5 +17,5 @@ __all__ = [ "Table", "copying", "gpumemoryview", - # "interop", + "interop", ]