From 36aebcbe1a5bba676f16b49c0184e091af07e73c Mon Sep 17 00:00:00 2001 From: Clark Zinzow Date: Sun, 27 Nov 2022 12:53:30 -0800 Subject: [PATCH] [Datasets] Fix ndarray representation of single-element ragged tensor slices. (#30514) Single-element ragged tensor slices (e.g. a[5:6]) currently have the wrong NumPy representation; namely, although they are object-dtyped, they have a multi-dimensional shape, and their single tensor element isn't well-typed (in other words, it doesn't use the pointer-to-subndarrays representation). This is due to np.array([subndarray], dtype=object) trying to create a more consolidated representation than np.array([subndarray1, subndarray2], dtype=object). This causes single-element batches of ragged tensor slices to fail to eventually be put back into the tensor extension representation. This PR fixes this by doing a very explicit ragged tensor construction via the create-and-fill method: we allocate an empty, object-dtyped 1D array and fill it with the tensor elements. This prevents NumPy from trying to optimize the ragged tensor representation. --- python/ray/air/tests/test_tensor_extension.py | 9 +++++++- .../ray/air/util/tensor_extensions/arrow.py | 12 ++++++---- .../ray/air/util/tensor_extensions/pandas.py | 16 ++++++-------- .../ray/air/util/tensor_extensions/utils.py | 22 +++++++++++++++++++ 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/python/ray/air/tests/test_tensor_extension.py b/python/ray/air/tests/test_tensor_extension.py index a04c1ac3359e..811116a82105 100644 --- a/python/ray/air/tests/test_tensor_extension.py +++ b/python/ray/air/tests/test_tensor_extension.py @@ -178,7 +178,14 @@ def test_arrow_variable_shaped_tensor_array_slice(): slice(0, 3), ] for slice_ in slices: - for o, e in zip(ata[slice_], arr[slice_]): + ata_slice = ata[slice_] + ata_slice_np = ata_slice.to_numpy() + arr_slice = arr[slice_] + # Check for equivalent dtypes and shapes. 
+ assert ata_slice_np.dtype == arr_slice.dtype + assert ata_slice_np.shape == arr_slice.shape + # Iteration over tensor array slices triggers NumPy conversion. + for o, e in zip(ata_slice, arr_slice): np.testing.assert_array_equal(o, e) diff --git a/python/ray/air/util/tensor_extensions/arrow.py b/python/ray/air/util/tensor_extensions/arrow.py index a6f8282b66cb..91b5daecbefe 100644 --- a/python/ray/air/util/tensor_extensions/arrow.py +++ b/python/ray/air/util/tensor_extensions/arrow.py @@ -5,7 +5,10 @@ import numpy as np import pyarrow as pa -from ray.air.util.tensor_extensions.utils import _is_ndarray_variable_shaped_tensor +from ray.air.util.tensor_extensions.utils import ( + _is_ndarray_variable_shaped_tensor, + _create_strict_ragged_ndarray, +) from ray._private.utils import _get_pyarrow_version from ray.util.annotations import PublicAPI @@ -664,10 +667,11 @@ def from_numpy( np_data_buffer = np.concatenate(raveled) dtype = np_data_buffer.dtype if dtype.type is np.object_: + types_and_shapes = [(f"dtype={a.dtype}", f"shape={a.shape}") for a in arr] raise ValueError( "ArrowVariableShapedTensorArray only supports heterogeneous-shaped " - "tensor collections, not arbitrarily nested ragged tensors. Got: " - f"{arr}" + "tensor collections, not arbitrarily nested ragged tensors. Got " + f"arrays: {types_and_shapes}" ) pa_dtype = pa.from_numpy_dtype(dtype) if dtype.type is np.bool_: @@ -720,7 +724,7 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False): arrs = [self._to_numpy(i, zero_copy_only) for i in range(len(self))] # Return ragged NumPy ndarray in the ndarray of ndarray pointers # representation. 
- return np.array(arrs, dtype=object) + return _create_strict_ragged_ndarray(arrs) data = self.storage.field("data") shapes = self.storage.field("shape") value_type = data.type.value_type diff --git a/python/ray/air/util/tensor_extensions/pandas.py b/python/ray/air/util/tensor_extensions/pandas.py index e23cce7b9198..4201be97201f 100644 --- a/python/ray/air/util/tensor_extensions/pandas.py +++ b/python/ray/air/util/tensor_extensions/pandas.py @@ -44,7 +44,10 @@ from pandas.core.indexers import check_array_indexer, validate_indices from pandas.io.formats.format import ExtensionArrayFormatter -from ray.air.util.tensor_extensions.utils import _is_ndarray_variable_shaped_tensor +from ray.air.util.tensor_extensions.utils import ( + _is_ndarray_variable_shaped_tensor, + _create_strict_ragged_ndarray, +) from ray.util.annotations import PublicAPI try: @@ -1422,9 +1425,7 @@ def _is_boolean(self): def _create_possibly_ragged_ndarray( - values: Union[ - np.ndarray, ABCSeries, Sequence[Union[np.ndarray, TensorArrayElement]] - ] + values: Union[np.ndarray, ABCSeries, Sequence[np.ndarray]] ) -> np.ndarray: """ Create a possibly ragged ndarray. @@ -1438,11 +1439,8 @@ def _create_possibly_ragged_ndarray( return np.array(values, copy=False) except ValueError as e: if "could not broadcast input array from shape" in str(e): - # Create an empty object-dtyped 1D array. - arr = np.empty(len(values), dtype=object) - # Try to fill the 1D array of pointers with the (ragged) tensors. - arr[:] = list(values) - return arr + # Fall back to strictly creating a ragged ndarray. + return _create_strict_ragged_ndarray(values) else: # Re-raise original error if the failure wasn't a broadcast error. 
raise e from None diff --git a/python/ray/air/util/tensor_extensions/utils.py b/python/ray/air/util/tensor_extensions/utils.py index 3b7e60a579fb..f28928b54c2c 100644 --- a/python/ray/air/util/tensor_extensions/utils.py +++ b/python/ray/air/util/tensor_extensions/utils.py @@ -1,3 +1,5 @@ +from typing import Any + import numpy as np @@ -20,3 +22,23 @@ def _is_ndarray_variable_shaped_tensor(arr: np.ndarray) -> bool: if a.shape != shape: return True return True + + +def _create_strict_ragged_ndarray(values: Any) -> np.ndarray: + """Create a ragged ndarray; the representation will be ragged (1D array of + subndarray pointers) even if it's possible to represent it as a non-ragged ndarray. + """ + # Use the create-empty-and-fill method. This avoids the following pitfalls of the + # np.array constructor - np.array(values, dtype=object): + # 1. It will fail to construct an ndarray if the first element dimension is + # uniform, e.g. for imagery whose first element dimension is the channel. + # 2. It will construct the wrong representation for a single-row column (i.e. unit + # outer dimension). Namely, it will consolidate it into a single multi-dimensional + # ndarray rather than a 1D array of subndarray pointers, resulting in the single + # row not being well-typed (having object dtype). + + # Create an empty object-dtyped 1D array. + arr = np.empty(len(values), dtype=object) + # Try to fill the 1D array of pointers with the (ragged) tensors. + arr[:] = list(values) + return arr