
[Datasets] Add from_tf #29591

Merged · 9 commits · Nov 8, 2022
Changes from 4 commits
8 changes: 5 additions & 3 deletions doc/source/data/api/input_output.rst
@@ -142,6 +142,11 @@ HuggingFace

.. autofunction:: ray.data.from_huggingface

+TensorFlow
+----------
+
+.. autofunction:: ray.data.from_tf
+

.. _data_source_api:

@@ -190,9 +195,6 @@ Built-in Datasources
.. autoclass:: ray.data.datasource.RangeDatasource
   :members:

-.. autoclass:: ray.data.datasource.SimpleTensorFlowDatasource
-   :members:
-
.. autoclass:: ray.data.datasource.SimpleTorchDatasource
   :members:

23 changes: 8 additions & 15 deletions doc/source/data/creating-datasets.rst
@@ -499,29 +499,22 @@ From Torch and TensorFlow
.. tabbed:: TensorFlow

    If you already have a TensorFlow dataset available, you can create a Ray Dataset
-    using :py:class:`SimpleTensorFlowDatasource`.
+    using :class:`~ray.data.from_tf`.

    .. warning::
-        :py:class:`SimpleTensorFlowDatasource` doesn't support parallel reads. You
-        should only use this datasource for small datasets like MNIST or CIFAR.
+        :class:`~ray.data.from_tf` doesn't support parallel reads. You
+        should only use this function with small datasets like MNIST or CIFAR.

    .. code-block:: python

-        import ray.data
-        from ray.data.datasource import SimpleTensorFlowDatasource
+        import ray
        import tensorflow_datasets as tfds

-        def dataset_factory():
-            return tfds.load("cifar10", split=["train"], as_supervised=True)[0]
+        dataset, _ = tfds.load("cifar10", split=["train", "test"])
+        dataset = ray.data.from_tf(dataset)

-        dataset = ray.data.read_datasource(
-            SimpleTensorFlowDatasource(),
-            parallelism=1,
-            dataset_factory=dataset_factory
-        )
-        features, label = dataset.take(1)[0]
-        features.shape  # TensorShape([32, 32, 3])
-        label  # <tf.Tensor: shape=(), dtype=int64, numpy=7>
+        dataset
+        # -> Dataset(num_blocks=200, num_rows=50000, schema={id: binary, image: ArrowTensorType(shape=(32, 32, 3), dtype=uint8), label: int64})

.. _dataset_from_huggingface:

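As a quick sanity check of the new API outside tensorflow_datasets, here is a minimal sketch (assumes TensorFlow is installed and a Ray build that includes this PR; dict elements become rows because from_tf funnels through from_items):

    import ray
    import tensorflow as tf

    # Build a tiny in-memory tf.data.Dataset; each element is a dict of tensors.
    tf_dataset = tf.data.Dataset.from_tensor_slices(
        {"feature": [[1.0, 2.0], [3.0, 4.0]], "label": [0, 1]}
    )

    # Each element of the TensorFlow dataset becomes one row of the Ray Dataset.
    ray_dataset = ray.data.from_tf(tf_dataset)
    print(ray_dataset.take_all())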
2 changes: 2 additions & 0 deletions python/ray/data/__init__.py
@@ -22,6 +22,7 @@
    from_pandas,
    from_pandas_refs,
    from_spark,
+    from_tf,
    range,
    range_arrow,
    range_table,
@@ -64,6 +65,7 @@
    "from_pandas",
    "from_pandas_refs",
    "from_spark",
+    "from_tf",
    "from_huggingface",
    "range",
    "range_table",
2 changes: 0 additions & 2 deletions python/ray/data/datasource/__init__.py
@@ -37,7 +37,6 @@
    Partitioning,
)
from ray.data.datasource.tfrecords_datasource import TFRecordDatasource
-from ray.data.datasource.tensorflow_datasource import SimpleTensorFlowDatasource
from ray.data.datasource.text_datasource import TextDatasource
from ray.data.datasource.torch_datasource import SimpleTorchDatasource

@@ -70,7 +69,6 @@
    "RangeDatasource",
    "ReadTask",
    "Reader",
-    "SimpleTensorFlowDatasource",
    "SimpleTorchDatasource",
    "TextDatasource",
    "TFRecordDatasource",
73 changes: 0 additions & 73 deletions python/ray/data/datasource/tensorflow_datasource.py

This file was deleted.
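For users of the deleted datasource, a hedged migration sketch (assumes a Ray build with this PR; the tfds calls mirror the doc example and test elsewhere in this diff):

    # Before this PR (SimpleTensorFlowDatasource, now removed):
    #
    #     from ray.data.datasource import SimpleTensorFlowDatasource
    #
    #     def dataset_factory():
    #         return tfds.load("mnist", split=["train"], as_supervised=True)[0]
    #
    #     ds = ray.data.read_datasource(
    #         SimpleTensorFlowDatasource(), parallelism=1, dataset_factory=dataset_factory
    #     )

    # After this PR (from_tf takes the dataset object directly):
    import ray
    import tensorflow_datasets as tfds

    tf_dataset = tfds.load("mnist", split=["train"], as_supervised=True)[0]
    ds = ray.data.from_tf(tf_dataset)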

53 changes: 53 additions & 0 deletions python/ray/data/read_api.py
@@ -57,6 +57,7 @@
    import pandas
    import pyarrow
    import pyspark
+    import tensorflow as tf


T = TypeVar("T")
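These imports presumably sit under typing.TYPE_CHECKING alongside pandas/pyarrow/pyspark (an assumption based on the indentation and the quoted annotation below), so TensorFlow never becomes a hard runtime dependency. A sketch of the pattern:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by type checkers, never at runtime.
        import tensorflow as tf

    def from_tf(dataset: "tf.data.Dataset"):
        # The quoted annotation defers evaluation, so TensorFlow is only
        # imported by users who actually pass a tf.data.Dataset.
        ...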
@@ -1326,6 +1327,58 @@ def convert(ds: "datasets.Dataset") -> Dataset[ArrowRow]:
    )


+@PublicAPI
+def from_tf(
Review thread on `def from_tf(`:

Contributor: Can we expand this?

    Suggested change:
    -def from_tf(
    +def from_tensorflow(

Member (author): We have a method called Dataset.to_tf. I prefer tensorflow over
tf, but we should be consistent. What do you think? We could always rename
Dataset.to_tf to Dataset.to_tensorflow.

Contributor: Ah, right... Maybe we can:

  1. Name this from_tensorflow.
  2. Create a to_tensorflow.
  3. Have to_tf redirect to to_tensorflow and eventually deprecate it.

But I'll leave it to folks like @clarkzinzow and @c21 to make the call here.

Contributor (@clarkzinzow, Nov 3, 2022): I'd actually vote for renaming these to
from_tf_dataset, to_tf_dataset, from_torch_dataset, to_torch_dataset, etc. This
is less ambiguous ("Am I getting a TensorFlow tensor or a dataset with this
API?"), more discoverable, and better matches what the rest of the ecosystem is
doing, e.g. HuggingFace and Petastorm.

I think tf is a common enough shorthand that we shouldn't expand it to
tensorflow, but I'm happy to hear others' opinions, and maybe we can do a quick
ecosystem check to verify this.
+    dataset: "tf.data.Dataset",
+) -> Dataset:
"""Create a dataset from a TensorFlow dataset.

This function is inefficient. Use it to read small datasets or prototype.

.. warning::
If your dataset is large, this function may execute slowly or raise an
out-of-memory error. To avoid issues, read the underyling data with a function
like :meth:`~ray.data.read_images`.

.. note::
This function isn't paralellized. It loads the entire dataset into the head
node's memory before moving the data to the distributed object store.

Examples:
>>> import ray
>>> import tensorflow_datasets as tfds
>>> dataset, _ = tfds.load('cifar10', split=["train", "test"])
>>> dataset = ray.data.from_tf(dataset)
>>> dataset
Dataset(num_blocks=200, num_rows=50000, schema={id: binary, image: ArrowTensorType(shape=(32, 32, 3), dtype=uint8), label: int64})
>>> dataset.take(1) # doctest: +SKIP
[{'id': b'train_16399', 'image': array([[[143, 96, 70],
[141, 96, 72],
[135, 93, 72],
...,
[ 96, 37, 19],
[105, 42, 18],
[104, 38, 20]],

...,

[[195, 161, 126],
[187, 153, 123],
[186, 151, 128],
...,
[212, 177, 147],
[219, 185, 155],
[221, 187, 157]]], dtype=uint8), 'label': 7}]

Args:
dataset: A TensorFlow dataset.

Returns:
A :class:`Dataset` that contains the samples stored in the TensorFlow dataset.
""" # noqa: E501
return from_items(list(dataset.as_numpy_iterator()))
Review comment (Contributor) on the line above:

    Misc. thing to note that I hit before in tests: .as_numpy_iterator() doesn't
    work with ragged tensors. tensorflow/tensorflow#53149
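A minimal reproduction of that limitation (hedged: the behavior depends on the TensorFlow version, and recent releases have added ragged support to as_numpy_iterator):

    import tensorflow as tf

    # A dataset whose elements are themselves ragged (variable-length rows).
    ragged = tf.ragged.constant([[[1, 2], [3]], [[4, 5, 6]]])
    ds = tf.data.Dataset.from_tensor_slices(ragged)

    try:
        # Older TF versions raise TypeError here because the element spec is a
        # RaggedTensorSpec rather than a plain TensorSpec.
        print(list(ds.as_numpy_iterator()))
    except TypeError as exc:
        print(f"as_numpy_iterator() rejected the ragged dataset: {exc}")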



def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
stats = BlockExecStats.builder()
import pyarrow as pa
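To make the docstring's warning concrete, a hedged sketch of the scalable alternative it points to (the bucket path is hypothetical; read_images is the Ray Data reader the docstring names):

    import ray

    # from_tf: single-process, whole dataset materialized on the driver.
    # ds = ray.data.from_tf(small_tf_dataset)

    # Parallel alternative for large data: read the underlying files directly.
    ds = ray.data.read_images("s3://example-bucket/cifar10/train/")  # hypothetical path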
14 changes: 4 additions & 10 deletions python/ray/data/tests/test_dataset_formats.py
@@ -16,7 +16,6 @@
from ray.data.datasource import (
    Datasource,
    DummyOutputDatasource,
-    SimpleTensorFlowDatasource,
    SimpleTorchDatasource,
    WriteResult,
)
@@ -192,23 +191,18 @@ def test_write_datasource(ray_start_regular_shared, pipelined):
    assert ray.get(output.data_sink.get_rows_written.remote()) == 10


-def test_tensorflow_datasource(ray_start_regular_shared):
+def test_from_tf(ray_start_regular_shared):
    import tensorflow as tf
    import tensorflow_datasets as tfds

+    tf_dataset = tfds.load("mnist", split=["train"], as_supervised=True)[0]
+    tf_dataset = tf_dataset.take(8)  # Use subset to make test run faster.

-    def dataset_factory():
-        return tfds.load("mnist", split=["train"], as_supervised=True)[0]
-
-    ray_dataset = ray.data.read_datasource(
-        SimpleTensorFlowDatasource(), parallelism=1, dataset_factory=dataset_factory
-    ).fully_executed()
-
-    assert ray_dataset.num_blocks() == 1
+    ray_dataset = ray.data.from_tf(tf_dataset)

    actual_data = ray_dataset.take_all()
    expected_data = list(tf_dataset)
    assert len(actual_data) == len(expected_data)
    for (expected_features, expected_label), (actual_features, actual_label) in zip(
        expected_data, actual_data
    ):
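The hunk is truncated here. As a hedged sketch (not necessarily the PR's exact loop body), a comparison like this would typically conclude with element-wise TensorFlow assertions:

    for (expected_features, expected_label), (actual_features, actual_label) in zip(
        expected_data, actual_data
    ):
        # tf.debugging.assert_equal raises InvalidArgumentError on any mismatch.
        tf.debugging.assert_equal(expected_features, actual_features)
        tf.debugging.assert_equal(expected_label, actual_label)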
Expand Down