From 45f976db75507ed8b8992c14d590eece1529655d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:24:17 -0700 Subject: [PATCH 1/8] Ensure objects with __interface__ are converted to cupy/numpy arrays --- python/cudf/cudf/core/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1d7136e61e3..fc3535dd60b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -782,7 +782,7 @@ def __init__( ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ - + data = cupy.asfortranarray(data) # descr is an optional field of the _cuda_ary_iface_ if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: @@ -801,6 +801,7 @@ def __init__( self._check_data_index_length_match() elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ + data = np.asfortranarray(data) if len(arr_interface["descr"]) == 1: # not record arrays new_df = self._from_arrays(data, index=index, columns=columns) From 3be2cbac381ad4c6cbea13f8ded3c3407bbc2300 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:26:28 -0700 Subject: [PATCH 2/8] Simplify ascontiguousarray --- python/cudf/cudf/core/column/column.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 32e6aade65b..d35ea6d93eb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1843,7 +1843,6 @@ def as_column( else: mask = None - arbitrary = cupy.asarray(arbitrary) arbitrary = cupy.ascontiguousarray(arbitrary) data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) @@ -2036,7 +2035,7 @@ def as_column( check_invalid_array(desc["shape"], 
np.dtype(desc["typestr"])) # CUDF assumes values are always contiguous - arbitrary = np.asarray(arbitrary, order="C") + arbitrary = np.ascontiguousarray(arbitrary) if arbitrary.ndim == 0: # TODO: Or treat as scalar? From c6b4f8f0439636bcc564b67562203495f7be4383 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:38:11 -0700 Subject: [PATCH 3/8] Handle it in _from_array --- python/cudf/cudf/core/dataframe.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fc3535dd60b..dafc86d33df 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -782,7 +782,6 @@ def __init__( ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ - data = cupy.asfortranarray(data) # descr is an optional field of the _cuda_ary_iface_ if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: @@ -801,7 +800,6 @@ def __init__( self._check_data_index_length_match() elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ - data = np.asfortranarray(data) if len(arr_interface["descr"]) == 1: # not record arrays new_df = self._from_arrays(data, index=index, columns=columns) @@ -5836,17 +5834,18 @@ def from_records( @_performance_tracking def _from_arrays( cls, - data: np.ndarray | cupy.ndarray, + data, index=None, columns=None, nan_as_null=False, ): - """Convert a numpy/cupy array to DataFrame. + """ + Convert an object implementing an array interface to DataFrame. Parameters ---------- - data : numpy/cupy array of ndim 1 or 2, - dimensions greater than 2 are not supported yet. + data : object of ndim 1 or 2, + Object implementing __array_interface__ or __cuda_array_interface__ index : Index or array-like Index to use for resulting frame. 
Will default to RangeIndex if no indexing information part of input data and @@ -5858,7 +5857,12 @@ def _from_arrays( ------- DataFrame """ - if data.ndim != 1 and data.ndim != 2: + if hasattr(data, "__cuda_array_interface__"): + data = cupy.asfortranarray(data) + elif hasattr(data, "__array_interface__"): + data = np.asfortranarray(data) + + if data.ndim not in {1, 2}: raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}" ) From e7c660af47398acde2f1e31057c690e778a73056 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:45:18 -0700 Subject: [PATCH 4/8] Use asarray --- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d35ea6d93eb..7e0d8ced595 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1843,7 +1843,7 @@ def as_column( else: mask = None - arbitrary = cupy.ascontiguousarray(arbitrary) + arbitrary = cupy.asarray(arbitrary, order="C") data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) col = build_column(data, dtype=arbitrary.dtype, mask=mask) @@ -2035,7 +2035,7 @@ def as_column( check_invalid_array(desc["shape"], np.dtype(desc["typestr"])) # CUDF assumes values are always contiguous - arbitrary = np.ascontiguousarray(arbitrary) + arbitrary = np.asarray(arbitrary, order="C") if arbitrary.ndim == 0: # TODO: Or treat as scalar? 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index dafc86d33df..1dce5e8e72c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5858,9 +5858,9 @@ def _from_arrays( DataFrame """ if hasattr(data, "__cuda_array_interface__"): - data = cupy.asfortranarray(data) + data = cupy.asarray(data, order="F") elif hasattr(data, "__array_interface__"): - data = np.asfortranarray(data) + data = np.asarray(data, order="F") if data.ndim not in {1, 2}: raise ValueError( From aff2f89e933adbcec148800e0b58db89548681f7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:56:10 -0700 Subject: [PATCH 5/8] Update python/cudf/cudf/core/dataframe.py Co-authored-by: Bradley Dice --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1dce5e8e72c..07e4c854414 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5845,7 +5845,7 @@ def _from_arrays( Parameters ---------- data : object of ndim 1 or 2, - Object implementing __array_interface__ or __cuda_array_interface__ + Object implementing ``__array_interface__`` or ``__cuda_array_interface__`` index : Index or array-like Index to use for resulting frame. 
Will default to RangeIndex if no indexing information part of input data and From ac402cf2a6ab4da88c119e68ce7ec90afe51c9c7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 12:41:59 -0700 Subject: [PATCH 6/8] Add benchmark and mypy typing --- python/cudf/benchmarks/API/bench_dataframe.py | 6 +++++ python/cudf/cudf/core/dataframe.py | 23 +++++++++++-------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 59d73015962..b404040d019 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -4,6 +4,7 @@ import string +import numba.cuda import numpy import pytest import pytest_cases @@ -16,6 +17,11 @@ def bench_construction(benchmark, N): benchmark(cudf.DataFrame, {None: cupy.random.rand(N)}) +@pytest.mark.parametrize("N", [100, 100_000]) +def bench_construction_numba_device_array(benchmark, N): + benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones(N))) + + @benchmark_with_object(cls="dataframe", dtype="float", cols=6) @pytest.mark.parametrize( "expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 07e4c854414..73b7512f90b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5857,18 +5857,23 @@ def _from_arrays( ------- DataFrame """ + array_data = np.ndarray | cupy.ndarray if hasattr(data, "__cuda_array_interface__"): - data = cupy.asarray(data, order="F") + array_data = cupy.asarray(data, order="F") elif hasattr(data, "__array_interface__"): - data = np.asarray(data, order="F") + array_data = np.asarray(data, order="F") + else: + raise ValueError( + "data must be an object implementing __cuda_array_interface__ or __array_interface__" + ) - if data.ndim not in {1, 2}: + if array_data.ndim 
not in {1, 2}: raise ValueError( - f"records dimension expected 1 or 2 but found: {data.ndim}" + f"records dimension expected 1 or 2 but found: {array_data.ndim}" ) if data.ndim == 2: - num_cols = data.shape[1] + num_cols = array_data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, # this case can be assumed to be ndim == 1. @@ -5886,14 +5891,14 @@ def _from_arrays( raise ValueError("Duplicate column names are not allowed") names = columns - if data.ndim == 2: + if array_data.ndim == 2: ca_data = { - k: column.as_column(data[:, i], nan_as_null=nan_as_null) + k: column.as_column(array_data[:, i], nan_as_null=nan_as_null) for i, k in enumerate(names) } - elif data.ndim == 1: + elif array_data.ndim == 1: ca_data = { - names[0]: column.as_column(data, nan_as_null=nan_as_null) + names[0]: column.as_column(array_data, nan_as_null=nan_as_null) } if index is not None: From fd4bf2d003c3005df2f146718bd86d0399321dd9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 12:43:13 -0700 Subject: [PATCH 7/8] Make benchmark 2D --- python/cudf/benchmarks/API/bench_dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index b404040d019..8b47a5d3d0c 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -19,7 +19,7 @@ def bench_construction(benchmark, N): @pytest.mark.parametrize("N", [100, 100_000]) def bench_construction_numba_device_array(benchmark, N): - benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones(N))) + benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones((100, N)))) @benchmark_with_object(cls="dataframe", dtype="float", cols=6) From 6a5864b7a35bd3320765fba6429fb136f2bb7cf0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jul 2024 19:38:34 -0700 
Subject: [PATCH 8/8] Fix typo and mark as pandas_incompatible --- python/cudf/benchmarks/API/bench_dataframe.py | 1 + python/cudf/cudf/core/dataframe.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index 8b47a5d3d0c..ba243eb6a7c 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -18,6 +18,7 @@ def bench_construction(benchmark, N): @pytest.mark.parametrize("N", [100, 100_000]) +@pytest.mark.pandas_incompatible def bench_construction_numba_device_array(benchmark, N): benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones((100, N)))) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 73b7512f90b..dca0c0b821a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5857,7 +5857,7 @@ def _from_arrays( ------- DataFrame """ - array_data = np.ndarray | cupy.ndarray + array_data: np.ndarray | cupy.ndarray if hasattr(data, "__cuda_array_interface__"): array_data = cupy.asarray(data, order="F") elif hasattr(data, "__array_interface__"):