[AIR - Datasets] Add experimental read_images (ray-project#29177)
Users can't discover ImageFolderDatasource. This PR adds a more discoverable way to read images.

Signed-off-by: Balaji Veeramani <[email protected]>
Co-authored-by: Balaji Veeramani <[email protected]>
Signed-off-by: Weichen Xu <[email protected]>
2 people authored and WeichenXu123 committed Dec 19, 2022
1 parent cae6dc7 commit 0c861fd
Showing 33 changed files with 361 additions and 388 deletions.
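In short, this commit replaces the datasource-based image loading path with a dedicated ray.data.read_images() reader and renames the public ImageFolderDatasource export to ImageDatasource. A minimal sketch of the new call, assembled from the examples in the changed files below (the example:// path, the S3 data URL, and the size argument all come from this diff):

import ray

# New experimental top-level reader added by this commit.
ds = ray.data.read_images("example://image-datasets/simple")
# -> Dataset(num_blocks=3, num_rows=3,
#            schema={__value__: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)})

# Images can also be resized on read, as in the batch-prediction example below.
ds = ray.data.read_images(
    "s3://anonymous@air-example-data-2/1G-image-data-synthetic-raw", size=(256, 256)
)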
7 changes: 6 additions & 1 deletion doc/source/data/api/input_output.rst
@@ -48,6 +48,11 @@ Text

.. autofunction:: ray.data.read_text

Images (experimental)
---------------------

.. autofunction:: ray.data.read_images

Binary
------

@@ -167,7 +172,7 @@ Built-in Datasources
.. autoclass:: ray.data.datasource.FileBasedDatasource
:members:

.. autoclass:: ray.data.datasource.ImageFolderDatasource
.. autoclass:: ray.data.datasource.ImageDatasource
:members:

.. autoclass:: ray.data.datasource.JSONDatasource
29 changes: 16 additions & 13 deletions doc/source/data/creating-datasets.rst
@@ -162,6 +162,22 @@ Supported File Formats

See the API docs for :func:`read_text() <ray.data.read_text>`.

.. tabbed:: Images (experimental)

Call :func:`~ray.data.read_images` to read images into a :class:`~ray.data.Dataset`.

This function stores image data in single-column
`Arrow Table <https://arrow.apache.org/docs/python/generated/pyarrow.Table.html>`__
blocks using the
:class:`tensor extension type <ray.data.extensions.tensor_extension.ArrowTensorType>`.
For more information on working with tensors in Datasets, read the
:ref:`tensor data guide <datasets_tensor_support>`.

.. literalinclude:: ./doc_code/creating_datasets.py
:language: python
:start-after: __read_images_begin__
:end-before: __read_images_end__

.. tabbed:: Binary

Read binary files into a ``Dataset``. Each binary file will be treated as a single row
@@ -533,19 +549,6 @@ converts it into a Ray Dataset directly.
ray_datasets["train"].take(2)
# [{'text': ''}, {'text': ' = Valkyria Chronicles III = \n'}]
.. _datasets_from_images:

-------------------------------
From Image Files (experimental)
-------------------------------

Load image data stored as individual files using :py:class:`~ray.data.datasource.ImageFolderDatasource`:

.. literalinclude:: ./doc_code/tensor.py
:language: python
:start-after: __create_images_begin__
:end-before: __create_images_end__

.. _datasets_custom_datasource:

------------------
2 changes: 1 addition & 1 deletion doc/source/data/dataset-tensor-support.rst
@@ -95,7 +95,7 @@ This section shows how to create single and multi-column tensor datasets.

.. tabbed:: Images (experimental)

Load image data stored as individual files using :class:`~ray.data.datasource.ImageFolderDatasource`:
Load image data stored as individual files using :func:`~ray.data.read_images`:

**Image and label columns**:

3 changes: 3 additions & 0 deletions doc/source/data/dataset.rst
@@ -201,6 +201,9 @@ Supported Input Formats
* - Text Files
- :func:`ray.data.read_text()`
- ✅
* - Image Files (experimental)
- :func:`ray.data.read_images()`
- 🚧
* - Binary Files
- :func:`ray.data.read_binary_files()`
- ✅
17 changes: 17 additions & 0 deletions doc/source/data/doc_code/creating_datasets.py
@@ -149,6 +149,23 @@
# __from_numpy_end__
# fmt: on

# fmt: off
# __read_images_begin__
ds = ray.data.read_images("example://image-datasets/simple")
# -> Dataset(num_blocks=3, num_rows=3,
# schema={__value__: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)})

ds.take(1)
# -> [array([[[ 88, 70, 68],
# [103, 88, 85],
# [112, 96, 97],
# ...,
# [168, 151, 81],
# [167, 149, 83],
# [166, 148, 82]]], dtype=uint8)]
# __read_images_end__
# fmt: on

# fmt: off
# __from_numpy_mult_begin__
import numpy as np
33 changes: 10 additions & 23 deletions doc/source/data/doc_code/tensor.py
@@ -194,31 +194,18 @@ def cast_udf(block: pa.Table) -> pa.Table:
ds.fully_executed()

# __create_images_begin__
from ray.data.datasource import ImageFolderDatasource

ds = ray.data.read_datasource(
ImageFolderDatasource(), root="example://image-folders/simple", size=(128, 128))
# -> Dataset(num_blocks=3, num_rows=3,
# schema={image: TensorDtype(shape=(128, 128, 3), dtype=uint8),
# label: object})
ds = ray.data.read_images("example://image-datasets/simple")
# -> Dataset(num_blocks=3, num_rows=3,
# schema={__value__: ArrowTensorType(shape=(32, 32, 3), dtype=uint8)})

ds.take(1)
# -> [{'image':
# array([[[ 92, 71, 57],
# [107, 87, 72],
# ...,
# [141, 161, 185],
# [139, 158, 184]],
#
# ...,
#
# [[135, 135, 109],
# [135, 135, 108],
# ...,
# [167, 150, 89],
# [165, 146, 90]]], dtype=uint8),
# 'label': 'cat',
# }]
# -> [array([[[ 88, 70, 68],
# [103, 88, 85],
# [112, 96, 97],
# ...,
# [168, 151, 81],
# [167, 149, 83],
# [166, 148, 82]]], dtype=uint8)]
# __create_images_end__


15 changes: 6 additions & 9 deletions doc/source/ray-air/examples/torch_image_batch_pretrained.py
@@ -1,4 +1,5 @@
import pandas as pd
import numpy as np

from torchvision import transforms
from torchvision.models import resnet18
@@ -7,10 +8,9 @@
from ray.train.torch import TorchCheckpoint, TorchPredictor
from ray.train.batch_predictor import BatchPredictor
from ray.data.preprocessors import BatchMapper
from ray.data.datasource import ImageFolderDatasource


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
def preprocess(batch: np.ndarray) -> pd.DataFrame:
"""
User Pytorch code to transform user image. Note we still use pandas as
intermediate format to hold images as shorthand of python dictionary.
@@ -23,20 +23,17 @@ def preprocess(df: pd.DataFrame) -> pd.DataFrame:
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
df.loc[:, "image"] = [preprocess(x).numpy() for x in df["image"]]
return df
return pd.DataFrame({"image": [preprocess(image) for image in batch]})


data_url = "s3://anonymous@air-example-data-2/1G-image-data-synthetic-raw"
print(f"Running GPU batch prediction with 1GB data from {data_url}")
dataset = ray.data.read_datasource(
ImageFolderDatasource(), root=data_url, size=(256, 256)
)
dataset = ray.data.read_images(data_url, size=(256, 256)).limit(10)

model = resnet18(pretrained=True)

preprocessor = BatchMapper(preprocess)
preprocessor = BatchMapper(preprocess, batch_format="numpy")
ckpt = TorchCheckpoint.from_model(model=model, preprocessor=preprocessor)

predictor = BatchPredictor.from_checkpoint(ckpt, TorchPredictor)
predictor.predict(dataset, feature_columns=["image"], batch_size=80)
predictor.predict(dataset, batch_size=80)
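A note on the preprocessor change above: because read_images produces a single-column tensor dataset, the preprocessing UDF now receives a NumPy batch rather than a pandas DataFrame, which is why the BatchMapper is built with batch_format="numpy". A self-contained sketch of that pattern follows; the transform details and path are taken from this example but simplified, so treat it as illustrative rather than the exact script.

import numpy as np
import pandas as pd
from torchvision import transforms

import ray
from ray.data.preprocessors import BatchMapper


def preprocess(batch: np.ndarray) -> pd.DataFrame:
    # `batch` is a stacked uint8 array of HWC images produced by read_images.
    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    # Return a DataFrame so downstream predictors see an "image" column.
    return pd.DataFrame({"image": [transform(image).numpy() for image in batch]})


ds = ray.data.read_images("example://image-datasets/simple", size=(256, 256))
preprocessor = BatchMapper(preprocess, batch_format="numpy")
# The preprocessor is then attached to a TorchCheckpoint and used with a
# BatchPredictor, as in the updated script above.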
2 changes: 2 additions & 0 deletions python/ray/data/__init__.py
@@ -29,6 +29,7 @@
read_binary_files,
read_csv,
read_datasource,
read_images,
read_json,
read_numpy,
read_parquet,
@@ -71,6 +72,7 @@
"read_binary_files",
"read_csv",
"read_datasource",
"read_images",
"read_json",
"read_numpy",
"read_parquet",
4 changes: 2 additions & 2 deletions python/ray/data/datasource/__init__.py
@@ -24,7 +24,7 @@
FileMetadataProvider,
ParquetMetadataProvider,
)
from ray.data.datasource.image_folder_datasource import ImageFolderDatasource
from ray.data.datasource.image_datasource import ImageDatasource
from ray.data.datasource.json_datasource import JSONDatasource
from ray.data.datasource.numpy_datasource import NumpyDatasource
from ray.data.datasource.parquet_base_datasource import ParquetBaseDatasource
@@ -55,7 +55,7 @@
"FileBasedDatasource",
"FileExtensionFilter",
"FileMetadataProvider",
"ImageFolderDatasource",
"ImageDatasource",
"JSONDatasource",
"NumpyDatasource",
"ParquetBaseDatasource",