ray-project · richardliaw · Nov 16, 2022 · Sep 30, 2022 · Sep 30, 2022 · Sep 30, 2022
@@ -658,7 +658,7 @@
     "\n",
     "sample_images = x_test[:3]\n",
     "sample_labels = y_test[:3]\n",
-    "preds = predictor.predict(sample_images).argmax(1)\n",
+    "preds = predictor.predict(sample_images)[\"predictions\"].argmax(1)\n",
     "for image, pred, label in zip(sample_images, preds, sample_labels):\n",
     "    plt.figure(figsize=(2, 2))\n",
     "    plt.title(f\"Prediction = {pred}, Label = {label}\")\n",

@@ -1,6 +1,6 @@
-import pandas as pd
 import numpy as np
 
+import torch
 from torchvision import transforms
 from torchvision.models import resnet18
 
@@ -10,20 +10,20 @@
 from ray.data.preprocessors import BatchMapper
 
 
-def preprocess(batch: np.ndarray) -> pd.DataFrame:
+def preprocess(image_batch: np.ndarray) -> np.ndarray:
     """
-    User Pytorch code to transform user image. Note we still use pandas as
-    intermediate format to hold images as shorthand of python dictionary.
+    User Pytorch code to transform user image with outer dimension of batch size.
     """
     preprocess = transforms.Compose(
         [
-            transforms.ToTensor(),
-            transforms.Resize(256),
+            # Torchvision's ToTensor does not accept outer batch dimension
             transforms.CenterCrop(224),
             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         ]
     )
-    return pd.DataFrame({"image": [preprocess(image) for image in batch]})
+    # Outer dimension is batch size such as (10, 256, 256, 3) -> (10, 3, 256, 256)
+    transposed_torch_tensor = torch.Tensor(image_batch.transpose(0, 3, 1, 2))
+    return preprocess(transposed_torch_tensor).numpy()
 
 
 data_url = "s3://anonymous@air-example-data-2/1G-image-data-synthetic-raw"

@@ -517,24 +517,26 @@
    },
    "outputs": [],
    "source": [
-    "from ray.data.preprocessors import BatchMapper\n",
+    "from typing import Dict\n",
+    "import numpy as np\n",
     "\n",
+    "import torch\n",
     "from torchvision import transforms\n",
     "\n",
-    "def preprocess_images(df: pd.DataFrame) -> pd.DataFrame:\n",
-    "    \"\"\"Preprocess images by scaling each channel in the image.\"\"\"\n",
+    "from ray.data.preprocessors import BatchMapper\n",
     "\n",
+    "def preprocess_images(image_batch_dict: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:\n",
+    "    \"\"\"Preprocess images by scaling each channel in the image.\"\"\"\n",
     "    torchvision_transforms = transforms.Compose(\n",
-    "      [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]\n",
+    "      [transforms.Normalize((0.1307,), (0.3081,))]\n",
     "    )\n",
+    "    # Outer dimension is batch size such as (4096, 28, 28)\n",
+    "    image_batch_dict[\"image\"] = torchvision_transforms(\n",
+    "        torch.Tensor(image_batch_dict[\"image\"])\n",
+    "    ).numpy()\n",
+    "    return image_batch_dict\n",
     "\n",
-    "    df = df.copy()\n",
-    "    df.loc[:, \"image\"] = [\n",
-    "        torchvision_transforms(image).numpy() for image in df[\"image\"]\n",
-    "    ]\n",
-    "    return df\n",
-    "\n",
-    "mnist_normalize_preprocessor = BatchMapper(fn=preprocess_images)"
+    "mnist_normalize_preprocessor = BatchMapper(fn=preprocess_images, batch_format=\"numpy\")"
    ]
   },
   {
@@ -1404,7 +1406,7 @@
     "          # Have to specify trainer_resources as 0 so that the example works on Colab. \n",
     "          scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu, trainer_resources={\"CPU\": 0}),\n",
     "          datasets={\"train\": train_dataset},\n",
-    "          preprocessor=BatchMapper(fn=preprocess_images),\n",
+    "          preprocessor=mnist_normalize_preprocessor,\n",
     "          resume_from_checkpoint=latest_checkpoint,\n",
     "      )\n",
     "  result = trainer.fit()\n",
@@ -1715,7 +1717,7 @@
     "            # Have to specify trainer_resources as 0 so that the example works on Colab. \n",
     "            scaling_config=ScalingConfig(num_workers=num_workers, use_gpu=use_gpu, trainer_resources={\"CPU\": 0}),\n",
     "            datasets={\"train\": combined_training_dataset},\n",
-    "            preprocessor=BatchMapper(fn=preprocess_images),\n",
+    "            preprocessor=mnist_normalize_preprocessor,\n",
     "        )\n",
     "result = trainer.fit()\n",
     "full_training_checkpoint = result.checkpoint"
@@ -1841,7 +1843,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.8.13"
   },
   "vscode": {
    "interpreter": {

@@ -20,7 +20,7 @@ Ray AIR Predictors are a class that loads models from `Checkpoint` to perform in
 
 Predictors are used by `BatchPredictor` and `PredictorDeployment` to do large-scale scoring or online inference.
 
-Let's walk through a basic usage of the Predictor. In the below example, we create `Checkpoint` object from a model definition. 
+Let's walk through a basic usage of the Predictor. In the example below, we create a `Checkpoint` object from a model definition.
 Checkpoints can be generated from a variety of different ways -- see the :ref:`Checkpoints <air-checkpoints-doc>` user guide for more details.
 
 The checkpoint then is used to create a framework specific Predictor (in our example, a `TensorflowPredictor`), which then can be used for inference:
@@ -46,7 +46,7 @@ Batch Prediction
 
 Ray AIR provides a ``BatchPredictor`` utility for large-scale batch inference.
 
-The BatchPredictor takes in a checkpoint and a predictor class and executes 
+The BatchPredictor takes in a checkpoint and a predictor class and executes
 large-scale batch prediction on a given dataset in a parallel/distributed fashion when calling ``predict()``.
 
 .. note::
@@ -117,10 +117,10 @@ Coming soon!
 Lazy/Pipelined Prediction (experimental)
 ----------------------------------------
 
-If you have a large dataset but not a lot of available memory, you can use the 
+If you have a large dataset but not a lot of available memory, you can use the
 :meth:`predict_pipelined <ray.train.batch_predictor.BatchPredictor.predict_pipelined>` method.
 
-Unlike :py:meth:`predict` which will load the entire data into memory, ``predict_pipelined`` will create a 
+Unlike :py:meth:`predict`, which will load the entire dataset into memory, ``predict_pipelined`` will create a
 :class:`DatasetPipeline` object, which will *lazily* load the data and perform inference on a smaller batch of data at a time.
 
 The lazy loading of the data will allow you to operate on datasets much greater than your available memory.
@@ -145,6 +145,4 @@ To implement a new Predictor for your particular framework, you should subclass
 
 1. ``_predict_pandas``: Given a pandas.DataFrame input, return a pandas.DataFrame containing predictions.
 2. ``from_checkpoint``: Logic for creating a Predictor from an :ref:`AIR Checkpoint <air-checkpoint-ref>`.
-3. Optionally ``_predict_arrow`` for better performance when working with tensor data to avoid extra copies from Pandas conversions.
-
-
+3. ``_predict_numpy`` (optional): Implement this for better performance when working with tensor data, as it avoids extra copies from pandas conversions.
@@ -22,11 +22,13 @@
 # __train_predict_end__
 
 # __batch_predict_start__
+import pandas as pd
 from ray.train.batch_predictor import BatchPredictor
 
 batch_predictor = BatchPredictor.from_checkpoint(result.checkpoint, XGBoostPredictor)
+predict_dataset = ray.data.from_pandas(pd.DataFrame({"x": np.arange(32)}))
 predictions = batch_predictor.predict(
-    data=ray.data.from_items([{"x": x} for x in range(32)]),
+    data=predict_dataset,
     batch_size=8,
     min_scoring_workers=2,
 )

diff --git a/python/ray/air/data_batch_type.py b/python/ray/air/data_batch_type.py
@@ -3,8 +3,5 @@
 if TYPE_CHECKING:
     import numpy
     import pandas
-    import pyarrow
 
-DataBatchType = Union[
-    "numpy.ndarray", "pandas.DataFrame", "pyarrow.Table", Dict[str, "numpy.ndarray"]
-]
+DataBatchType = Union["numpy.ndarray", "pandas.DataFrame", Dict[str, "numpy.ndarray"]]
diff --git a/python/ray/air/tests/test_data_batch_conversion.py b/python/ray/air/tests/test_data_batch_conversion.py
@@ -11,7 +11,7 @@
     convert_pandas_to_batch_type,
     _convert_batch_type_to_numpy,
 )
-from ray.air.util.data_batch_conversion import DataType
+from ray.air.util.data_batch_conversion import BatchFormat
 from ray.air.util.tensor_extensions.pandas import TensorArray
 from ray.air.util.tensor_extensions.arrow import ArrowTensorArray
 
@@ -22,7 +22,7 @@ def test_pandas_pandas():
     actual_output = convert_batch_type_to_pandas(input_data)
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
-    actual_output = convert_pandas_to_batch_type(actual_output, type=DataType.PANDAS)
+    actual_output = convert_pandas_to_batch_type(actual_output, type=BatchFormat.PANDAS)
     pd.testing.assert_frame_equal(actual_output, input_data)
 
 
@@ -144,7 +144,7 @@ def test_pandas_multi_dim_pandas(cast_tensor_columns, use_tensor_extension_for_i
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     actual_output = convert_pandas_to_batch_type(
-        actual_output, type=DataType.PANDAS, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.PANDAS, cast_tensor_columns=cast_tensor_columns
     )
     pd.testing.assert_frame_equal(actual_output, input_data)
 
@@ -157,7 +157,7 @@ def test_numpy_pandas(cast_tensor_columns):
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     output_array = convert_pandas_to_batch_type(
-        actual_output, type=DataType.NUMPY, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.NUMPY, cast_tensor_columns=cast_tensor_columns
     )
     np.testing.assert_equal(output_array, input_data)
 
@@ -170,7 +170,7 @@ def test_numpy_multi_dim_pandas(cast_tensor_columns):
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     output_array = convert_pandas_to_batch_type(
-        actual_output, type=DataType.NUMPY, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.NUMPY, cast_tensor_columns=cast_tensor_columns
     )
     np.testing.assert_array_equal(np.array(list(output_array)), input_data)
 
@@ -182,7 +182,7 @@ def test_numpy_object_pandas():
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     np.testing.assert_array_equal(
-        convert_pandas_to_batch_type(actual_output, type=DataType.NUMPY), input_data
+        convert_pandas_to_batch_type(actual_output, type=BatchFormat.NUMPY), input_data
     )
 
 
@@ -211,7 +211,7 @@ def test_dict_pandas(cast_tensor_columns):
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     output_array = convert_pandas_to_batch_type(
-        actual_output, type=DataType.NUMPY, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.NUMPY, cast_tensor_columns=cast_tensor_columns
     )
     np.testing.assert_array_equal(output_array, input_data["x"])
 
@@ -225,7 +225,7 @@ def test_dict_multi_dim_to_pandas(cast_tensor_columns):
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     output_array = convert_pandas_to_batch_type(
-        actual_output, type=DataType.NUMPY, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.NUMPY, cast_tensor_columns=cast_tensor_columns
     )
     np.testing.assert_array_equal(np.array(list(output_array)), input_data["x"])
 
@@ -238,7 +238,7 @@ def test_dict_pandas_multi_column(cast_tensor_columns):
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     output_dict = convert_pandas_to_batch_type(
-        actual_output, type=DataType.NUMPY, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.NUMPY, cast_tensor_columns=cast_tensor_columns
     )
     for k, v in output_dict.items():
         np.testing.assert_array_equal(v, array_dict[k])
@@ -251,7 +251,7 @@ def test_arrow_pandas():
     actual_output = convert_batch_type_to_pandas(input_data)
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
-    assert convert_pandas_to_batch_type(actual_output, type=DataType.ARROW).equals(
+    assert convert_pandas_to_batch_type(actual_output, type=BatchFormat.ARROW).equals(
         input_data
     )
 
@@ -270,7 +270,7 @@ def test_arrow_tensor_pandas(cast_tensor_columns):
     pd.testing.assert_frame_equal(expected_output, actual_output)
 
     arrow_output = convert_pandas_to_batch_type(
-        actual_output, type=DataType.ARROW, cast_tensor_columns=cast_tensor_columns
+        actual_output, type=BatchFormat.ARROW, cast_tensor_columns=cast_tensor_columns
     )
     assert arrow_output.equals(input_data)
 

diff --git a/python/ray/air/util/data_batch_conversion.py b/python/ray/air/util/data_batch_conversion.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from typing import Dict, Union, List
 
 import numpy as np
@@ -16,10 +16,20 @@
 
 
 @DeveloperAPI
-class DataType(Enum):
-    PANDAS = auto()
-    ARROW = auto()
-    NUMPY = auto()  # Either a single numpy array or a Dict of numpy arrays.
+class BatchFormat(str, Enum):
+    PANDAS = "pandas"
+    # TODO: Remove once Arrow is deprecated as user facing batch format
+    ARROW = "arrow"
+    NUMPY = "numpy"  # Either a single numpy array or a Dict of numpy arrays.
+
+
+@DeveloperAPI
+class BlockFormat(str, Enum):
+    """Internal Dataset block format enum."""
+
+    PANDAS = "pandas"
+    ARROW = "arrow"
+    SIMPLE = "simple"
 
 
 @DeveloperAPI
@@ -65,14 +75,14 @@ def convert_batch_type_to_pandas(
 @DeveloperAPI
 def convert_pandas_to_batch_type(
     data: pd.DataFrame,
-    type: DataType,
+    type: BatchFormat,
     cast_tensor_columns: bool = False,
 ) -> DataBatchType:
     """Convert the provided Pandas dataframe to the provided ``type``.
 
     Args:
         data: A Pandas DataFrame
-        type: The specific ``DataBatchType`` to convert to.
+        type: The specific ``BatchFormat`` to convert to.
         cast_tensor_columns: Whether tensor columns should be cast to our tensor
             extension type.
 
@@ -81,10 +91,10 @@ def convert_pandas_to_batch_type(
     """
     if cast_tensor_columns:
         data = _cast_ndarray_columns_to_tensor_extension(data)
-    if type == DataType.PANDAS:
+    if type == BatchFormat.PANDAS:
         return data
 
-    elif type == DataType.NUMPY:
+    elif type == BatchFormat.NUMPY:
         if len(data.columns) == 1:
             # If just a single column, return as a single numpy array.
             return data.iloc[:, 0].to_numpy()
@@ -95,7 +105,7 @@ def convert_pandas_to_batch_type(
                 output_dict[column] = data[column].to_numpy()
             return output_dict
 
-    elif type == DataType.ARROW:
+    elif type == BatchFormat.ARROW:
         if not pyarrow:
             raise ValueError(
                 "Attempted to convert data to Pyarrow Table but Pyarrow "
@@ -106,7 +116,7 @@ def convert_pandas_to_batch_type(
 
     else:
         raise ValueError(
-            f"Received type {type}, but expected it to be one of {DataType}"
+            f"Received type {type}, but expected it to be one of {DataBatchType}"
         )
 
 
@@ -164,7 +174,7 @@ def _convert_batch_type_to_numpy(
                 output_dict[col_name] = col.to_numpy(zero_copy_only=False)
             return output_dict
     elif isinstance(data, pd.DataFrame):
-        return convert_pandas_to_batch_type(data, DataType.NUMPY)
+        return convert_pandas_to_batch_type(data, BatchFormat.NUMPY)
     else:
         raise ValueError(
             f"Received data of type: {type(data)}, but expected it to be one "

@@ -23,7 +23,7 @@ def fast_repartition(blocks, num_blocks):
     )
     # Compute the (n-1) indices needed for an equal split of the data.
     count = wrapped_ds.count()
-    dataset_format = wrapped_ds._dataset_format()
+    dataset_format = wrapped_ds.dataset_format()
     indices = []
     cur_idx = 0
     for _ in range(num_blocks - 1):

@@ -61,7 +61,7 @@
 def _validate_key_fn(ds: "Dataset", key: KeyFn) -> None:
     """Check the key function is valid on the given dataset."""
     try:
-        fmt = ds._dataset_format()
+        fmt = ds.dataset_format()
     except ValueError:
         # Dataset is empty/cleared, validation not possible.
         return