Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pylibcudf.Scalar that interoperates with Arrow scalars #14133

Merged
merged 11 commits into from
Oct 6, 2023
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

Expand All @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.filling cimport calendrical_month_sequence
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar

Expand Down Expand Up @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset):
+ offset.kwds.get("months", 0)
)

cdef const scalar* c_start = start.c_value.get()
with nogil:
c_result = move(calendrical_month_sequence(
n,
start.c_value.get()[0],
c_start[0],
months
))
return Column.from_unique_ptr(move(c_result))
Expand Down
95 changes: 1 addition & 94 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,7 @@ from cpython cimport pycapsule
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport (
CScalar,
CTable,
pyarrow_unwrap_scalar,
pyarrow_unwrap_table,
pyarrow_wrap_scalar,
pyarrow_wrap_table,
)
from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table

from cudf._lib.cpp.interop cimport (
DLManagedTensor,
Expand All @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport (
to_arrow as cpp_to_arrow,
to_dlpack as cpp_to_dlpack,
)
from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport type_id
from cudf._lib.cpp.wrappers.decimals cimport (
decimal32,
decimal64,
decimal128,
scale_type,
)
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns

from cudf.api.types import is_list_dtype, is_struct_dtype
from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype


def from_dlpack(dlpack_capsule):
Expand Down Expand Up @@ -199,79 +182,3 @@ def from_arrow(object input_table):
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return columns_from_unique_ptr(move(c_result))


@acquire_spill_lock()
def to_arrow_scalar(DeviceScalar source_scalar):
    """Convert a cudf device scalar into a PyArrow scalar.

    Parameters
    ----------
    source_scalar : DeviceScalar
        The scalar to convert.

    Returns
    -------
    pyarrow.lib.Scalar
    """
    # A scalar carries no column name, so the metadata is gathered for a
    # single entry with an empty name and the scalar's dtype.
    cdef vector[column_metadata] metadata = gather_metadata(
        [("", source_scalar.dtype)]
    )
    cdef const scalar* raw_scalar = source_scalar.get_raw_ptr()
    cdef shared_ptr[CScalar] arrow_scalar

    # The libcudf conversion does not need the GIL.
    with nogil:
        arrow_scalar = cpp_to_arrow(raw_scalar[0], metadata[0])

    return pyarrow_wrap_scalar(arrow_scalar)


@acquire_spill_lock()
def from_arrow_scalar(object input_scalar, output_dtype=None):
    """Convert from PyArrow scalar to a cudf scalar.

    Parameters
    ----------
    input_scalar : PyArrow scalar
        The scalar to convert.
    output_dtype : output type to cast to, ignored except for decimals
        When the converted scalar is a DECIMAL128, a ``Decimal32Dtype`` or
        ``Decimal64Dtype`` narrows it to that width; any other dtype leaves
        the 128-bit scalar as is.

    Returns
    -------
    cudf._lib.DeviceScalar

    Raises
    ------
    TypeError
        If ``input_scalar`` is not a PyArrow scalar.
    ValueError
        If the converted scalar is a decimal and ``output_dtype`` is None.
    """
    cdef shared_ptr[CScalar] cpp_arrow_scalar = (
        pyarrow_unwrap_scalar(input_scalar)
    )
    cdef unique_ptr[scalar] c_result
    cdef type_id ctype

    # pyarrow_unwrap_scalar reports a non-scalar input by returning an empty
    # pointer instead of raising; dereferencing it below would crash the
    # interpreter, so turn it into a Python exception here.
    if cpp_arrow_scalar.get() == NULL:
        raise TypeError(
            f"Expected a pyarrow scalar, got {type(input_scalar).__name__}"
        )

    with nogil:
        c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0]))

    ctype = c_result.get().type().id()
    if ctype == type_id.DECIMAL128:
        if output_dtype is None:
            # Decimals must be cast to the cudf dtype of the right width
            raise ValueError(
                "Decimal scalars must be constructed with a dtype"
            )

        # The arguments to the new scalar are read from the current
        # c_result before reset() replaces (and frees) it.
        if isinstance(output_dtype, Decimal32Dtype):
            c_result.reset(
                new fixed_point_scalar[decimal32](
                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
                    scale_type(-input_scalar.type.scale),
                    c_result.get().is_valid()
                )
            )
        elif isinstance(output_dtype, Decimal64Dtype):
            c_result.reset(
                new fixed_point_scalar[decimal64](
                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
                    scale_type(-input_scalar.type.scale),
                    c_result.get().is_valid()
                )
            )
        # Decimal128Dtype is a no-op, no conversion needed.

    return DeviceScalar.from_unique_ptr(move(c_result), output_dtype)
27 changes: 26 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,35 @@
# the License.
# =============================================================================

set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx)
set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx
types.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)

find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)

# Ask the build-time Python interpreter where the pyarrow headers are installed.
execute_process(
  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
  ERROR_VARIABLE PYARROW_ERROR
  RESULT_VARIABLE PYARROW_RESULT
  OUTPUT_STRIP_TRAILING_WHITESPACE
)

# Without this check a failed import leaves PYARROW_INCLUDE_DIR empty and the build only breaks
# later with a missing-header error that does not point back at pyarrow.
if(NOT PYARROW_RESULT EQUAL 0)
  message(
    FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}"
  )
endif()

set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar)
foreach(target IN LISTS targets_using_arrow_headers)
  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
endforeach()

# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for
# more info
find_package(NumPy REQUIRED)
set(targets_using_numpy pylibcudf_interop pylibcudf_scalar)
foreach(target IN LISTS targets_using_numpy)
  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
  # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
  # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
endforeach()
8 changes: 7 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

# TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
from . cimport copying
# TODO: Cannot import interop because it introduces a build-time pyarrow header
# dependency for everything that cimports pylibcudf. See if there's a way to
# avoid that before polluting the whole package.
from . cimport copying # , interop
wence- marked this conversation as resolved.
Show resolved Hide resolved
from .column cimport Column
from .gpumemoryview cimport gpumemoryview
from .scalar cimport Scalar
from .table cimport Table
# TODO: cimport type_id once
# https://github.com/cython/cython/issues/5609 is resolved
Expand All @@ -12,7 +16,9 @@ from .types cimport DataType
__all__ = [
"Column",
"DataType",
"Scalar",
"Table",
"copying",
"gpumemoryview",
# "interop",
]
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from . import copying
from . import copying, interop
from .column import Column
from .gpumemoryview import gpumemoryview
from .scalar import Scalar
from .table import Table
from .types import DataType, TypeId

__all__ = [
"Column",
"DataType",
"Scalar",
"Table",
"TypeId",
"copying",
"gpumemoryview",
"interop",
]
26 changes: 26 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/interop.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table
vyasr marked this conversation as resolved.
Show resolved Hide resolved

from cudf._lib.cpp.interop cimport column_metadata

from .scalar cimport Scalar
from .table cimport Table


cdef class ColumnMetadata:
cdef public object name
cdef public object children_meta
cdef column_metadata to_c_metadata(self)

cpdef Table from_arrow(
pa_Table pyarrow_table,
)

cpdef Scalar from_arrow_scalar(
pa_Scalar pyarrow_scalar,
)

cpdef pa_Table to_arrow(Table tbl, list metadata)

cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata)
98 changes: 98 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/interop.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from cython.operator cimport dereference
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport (
CScalar as pa_CScalar,
CTable as pa_CTable,
Scalar as pa_Scalar,
Table as pa_Table,
pyarrow_unwrap_scalar,
pyarrow_unwrap_table,
pyarrow_wrap_scalar,
pyarrow_wrap_table,
)

from cudf._lib.cpp.interop cimport (
column_metadata,
from_arrow as cpp_from_arrow,
to_arrow as cpp_to_arrow,
)
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.table.table cimport table

from .scalar cimport Scalar
from .table cimport Table


cdef class ColumnMetadata:
    """Mutable column metadata: a name plus a list of child metadata.

    ``children_meta`` starts out empty and is expected to hold further
    ``ColumnMetadata`` instances, one per child column.
    """

    def __init__(self, name):
        self.name = name
        self.children_meta = []

    cdef column_metadata to_c_metadata(self):
        """Build the equivalent C++ ``column_metadata`` (recursively).

        The C++ object is created on demand from the Python attributes
        instead of backing them, because this class is mutable and cheap
        to convert.
        """
        cdef column_metadata result
        cdef ColumnMetadata child

        result.name = self.name.encode()
        for child in self.children_meta:
            result.children_meta.push_back(child.to_c_metadata())
        return result


cpdef Table from_arrow(
    pa_Table pyarrow_table,
):
    """Convert a ``pyarrow.Table`` into a pylibcudf ``Table``.

    Parameters
    ----------
    pyarrow_table : pyarrow.Table
        The table to convert.

    Returns
    -------
    Table
        A new table built from the libcudf conversion result.

    Raises
    ------
    TypeError
        If ``pyarrow_table`` does not wrap an Arrow table (e.g. it is None).
    """
    cdef shared_ptr[pa_CTable] ctable = (
        pyarrow_unwrap_table(pyarrow_table)
    )
    cdef unique_ptr[table] c_result

    # A typed extension-type argument still accepts None, in which case the
    # unwrap yields an empty pointer. Dereferencing it would crash, so raise
    # a Python exception instead.
    if ctable.get() == NULL:
        raise TypeError(
            f"Expected a pyarrow.Table, got {type(pyarrow_table).__name__}"
        )

    with nogil:
        c_result = move(cpp_from_arrow(dereference(ctable.get())))

    return Table.from_libcudf(move(c_result))


cpdef Scalar from_arrow_scalar(
    pa_Scalar pyarrow_scalar,
):
    """Convert a ``pyarrow.Scalar`` into a pylibcudf ``Scalar``.

    Parameters
    ----------
    pyarrow_scalar : pyarrow.Scalar
        The scalar to convert.

    Returns
    -------
    Scalar
        A new scalar built from the libcudf conversion result.

    Raises
    ------
    TypeError
        If ``pyarrow_scalar`` does not wrap an Arrow scalar (e.g. it is
        None).
    """
    cdef shared_ptr[pa_CScalar] cscalar = (
        pyarrow_unwrap_scalar(pyarrow_scalar)
    )
    cdef unique_ptr[scalar] c_result

    # A typed extension-type argument still accepts None, in which case the
    # unwrap yields an empty pointer. Dereferencing it would crash, so raise
    # a Python exception instead.
    if cscalar.get() == NULL:
        raise TypeError(
            f"Expected a pyarrow.Scalar, got {type(pyarrow_scalar).__name__}"
        )

    with nogil:
        c_result = move(cpp_from_arrow(dereference(cscalar.get())))

    return Scalar.from_libcudf(move(c_result))


cpdef pa_Table to_arrow(Table tbl, list metadata):
    """Convert a pylibcudf ``Table`` into a ``pyarrow.Table``.

    Parameters
    ----------
    tbl : Table
        The table to convert.
    metadata : list of ColumnMetadata
        Metadata entries passed to the libcudf conversion, in list order.

    Returns
    -------
    pyarrow.Table

    Raises
    ------
    TypeError
        If ``tbl`` or any entry of ``metadata`` is None.
    """
    cdef shared_ptr[pa_CTable] c_result
    cdef vector[column_metadata] c_metadata
    cdef ColumnMetadata meta

    # Typed extension-type variables accept None, and C-level method calls on
    # None are not checked, so reject it explicitly rather than crash.
    if tbl is None:
        raise TypeError("tbl must be a Table, not None")

    for meta in metadata:
        if meta is None:
            raise TypeError("metadata entries must be ColumnMetadata, not None")
        c_metadata.push_back(meta.to_c_metadata())

    with nogil:
        c_result = move(cpp_to_arrow(tbl.view(), c_metadata))

    return pyarrow_wrap_table(c_result)


cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata):
    """Convert a pylibcudf ``Scalar`` into a ``pyarrow.Scalar``.

    Parameters
    ----------
    slr : Scalar
        The scalar to convert.
    metadata : ColumnMetadata
        Metadata passed to the libcudf conversion.

    Returns
    -------
    pyarrow.Scalar

    Raises
    ------
    TypeError
        If ``slr`` or ``metadata`` is None.
    """
    cdef shared_ptr[pa_CScalar] c_result
    cdef column_metadata c_metadata

    # Typed extension-type arguments accept None, and C-level attribute or
    # method access on None is not checked, so reject it explicitly rather
    # than crash.
    if slr is None:
        raise TypeError("slr must be a Scalar, not None")
    if metadata is None:
        raise TypeError("metadata must be a ColumnMetadata, not None")

    c_metadata = metadata.to_c_metadata()

    with nogil:
        c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata))

    return pyarrow_wrap_scalar(c_result)
32 changes: 32 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/scalar.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr

from rmm._lib.memory_resource cimport DeviceMemoryResource

from cudf._lib.cpp.scalar.scalar cimport scalar

from .types cimport DataType


cdef class Scalar:
cdef unique_ptr[scalar] c_obj
cdef DataType _data_type

# Holds a reference to the DeviceMemoryResource used for allocation.
# Ensures the MR does not get destroyed before this Scalar. `mr` is
# needed for deallocation.
cdef DeviceMemoryResource mr

cdef const scalar* get(self) except *

cpdef DataType type(self)
cpdef bool is_valid(self)

@staticmethod
cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)

# TODO: Confirm that leaving `metadata` untyped is the right call. Typing it
# as ColumnMetadata would presumably cause a circular cimport, since
# interop.pxd (which declares ColumnMetadata) already cimports this module.
cpdef to_pyarrow_scalar(self, metadata)
Loading