Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure objects with __interface__ are converted to cupy/numpy arrays #16436

Merged
merged 8 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/cudf/benchmarks/API/bench_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import string

import numba.cuda
import numpy
import pytest
import pytest_cases
Expand All @@ -16,6 +17,12 @@ def bench_construction(benchmark, N):
benchmark(cudf.DataFrame, {None: cupy.random.rand(N)})


@pytest.mark.parametrize("N", [100, 100_000])
@pytest.mark.pandas_incompatible
def bench_construction_numba_device_array(benchmark, N):
    """Benchmark ``cudf.DataFrame`` construction from a Numba device array.

    The input is a ``(100, N)`` array of ones created with NumPy and copied
    to the device with ``numba.cuda.to_device``.

    Parameters
    ----------
    benchmark
        Callable that receives the constructor and its argument
        (presumably the pytest-benchmark fixture; confirm against conftest).
    N
        Size of the second dimension of the input array.
    """
    # The device array is fully built before ``benchmark`` is invoked, so
    # only the constructor and the ready-made array are handed to it.
    host_ones = numpy.ones((100, N))
    device_ones = numba.cuda.to_device(host_ones)
    benchmark(cudf.DataFrame, device_ones)


@benchmark_with_object(cls="dataframe", dtype="float", cols=6)
@pytest.mark.parametrize(
"expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"]
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1843,8 +1843,7 @@ def as_column(
else:
mask = None

arbitrary = cupy.asarray(arbitrary)
arbitrary = cupy.ascontiguousarray(arbitrary)
arbitrary = cupy.asarray(arbitrary, order="C")

data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
col = build_column(data, dtype=arbitrary.dtype, mask=mask)
Expand Down
34 changes: 22 additions & 12 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,6 @@ def __init__(
)
elif hasattr(data, "__cuda_array_interface__"):
arr_interface = data.__cuda_array_interface__

# descr is an optional field of the _cuda_ary_iface_
if "descr" in arr_interface:
if len(arr_interface["descr"]) == 1:
Expand Down Expand Up @@ -5835,17 +5834,18 @@ def from_records(
@_performance_tracking
def _from_arrays(
cls,
data: np.ndarray | cupy.ndarray,
data,
index=None,
columns=None,
nan_as_null=False,
):
"""Convert a numpy/cupy array to DataFrame.
"""
Convert an object implementing an array interface to DataFrame.

Parameters
----------
data : numpy/cupy array of ndim 1 or 2,
dimensions greater than 2 are not supported yet.
data : object of ndim 1 or 2,
Object implementing ``__array_interface__`` or ``__cuda_array_interface__``
index : Index or array-like
Index to use for resulting frame. Will default to
RangeIndex if no indexing information part of input data and
Expand All @@ -5857,13 +5857,23 @@ def _from_arrays(
-------
DataFrame
"""
if data.ndim != 1 and data.ndim != 2:
array_data: np.ndarray | cupy.ndarray
if hasattr(data, "__cuda_array_interface__"):
array_data = cupy.asarray(data, order="F")
jakirkham marked this conversation as resolved.
Show resolved Hide resolved
elif hasattr(data, "__array_interface__"):
array_data = np.asarray(data, order="F")
else:
raise ValueError(
f"records dimension expected 1 or 2 but found: {data.ndim}"
"data must be an object implementing __cuda_array_interface__ or __array_interface__"
)

if array_data.ndim not in {1, 2}:
raise ValueError(
f"records dimension expected 1 or 2 but found: {array_data.ndim}"
)

if data.ndim == 2:
num_cols = data.shape[1]
num_cols = array_data.shape[1]
else:
# Since we validate ndim to be either 1 or 2 above,
# this case can be assumed to be ndim == 1.
Expand All @@ -5881,14 +5891,14 @@ def _from_arrays(
raise ValueError("Duplicate column names are not allowed")
names = columns

if data.ndim == 2:
if array_data.ndim == 2:
ca_data = {
k: column.as_column(data[:, i], nan_as_null=nan_as_null)
k: column.as_column(array_data[:, i], nan_as_null=nan_as_null)
for i, k in enumerate(names)
}
elif data.ndim == 1:
elif array_data.ndim == 1:
ca_data = {
names[0]: column.as_column(data, nan_as_null=nan_as_null)
names[0]: column.as_column(array_data, nan_as_null=nan_as_null)
}

if index is not None:
Expand Down
Loading