
[docs] fix build #34265

Merged
merged 9 commits on Apr 12, 2023
14 changes: 14 additions & 0 deletions doc/source/data/api/dataset.rst
@@ -149,3 +149,17 @@ Serialization
Dataset.has_serializable_lineage
Dataset.serialize_lineage
Dataset.deserialize_lineage


Contributor Author

We need these to be explicitly in a TOC somewhere. We can argue about the exact position after fixing the build.

Internals
---------

.. autosummary::
:toctree: doc/
Contributor

Oh cool, this deprecates most of my fix PR here: #34228

Btw, is there a way to hide this internals section by default? Some of these are legacy backwards compatibility aliases that we don't want to expose.

Contributor Author

@ericl yes, toctrees can be :hidden:, if this is what we want.
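For reference, a minimal sketch of what a hidden Sphinx toctree looks like (a standalone illustration, not part of this diff; the entry name is hypothetical):

    .. toctree::
       :hidden:

       internals

Sphinx still generates and cross-links the listed pages, but the :hidden: flag keeps them out of the rendered navigation, which would match the goal of not advertising the legacy aliases.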


Dataset.__init__
Dataset.dataset_format
Dataset.fully_executed
Dataset.is_fully_executed
Dataset.lazy
Dataset.write_webdataset
Contributor

This one should be added to "I/O and Conversion" list in dataset.rst
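A sketch of what that entry could look like in dataset.rst, assuming the "I/O and Conversion" section uses the same autosummary pattern as the Internals section added above (exact placement in the list is a guess):

    I/O and Conversion
    ------------------

    .. autosummary::
       :toctree: doc/

       Dataset.write_webdataset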

7 changes: 4 additions & 3 deletions doc/source/data/getting-started.rst
@@ -52,7 +52,7 @@ transform datasets. Ray executes transformations in parallel for performance at

import pandas as pd

# Find rows with spepal length < 5.5 and petal length > 3.5.
# Find rows with sepal length < 5.5 and petal length > 3.5.
def transform_batch(df: pd.DataFrame) -> pd.DataFrame:
return df[(df["sepal length (cm)"] < 5.5) & (df["petal length (cm)"] > 3.5)]

@@ -62,8 +62,8 @@ transform datasets. Ray executes transformations in parallel for performance at
.. testoutput::

MapBatches(transform_batch)
+- Dataset(
num_blocks=...,
+- Datastream(
num_blocks=1,
num_rows=150,
schema={
sepal length (cm): double,
@@ -74,6 +74,7 @@ transform datasets. Ray executes transformations in parallel for performance at
}
)


To learn more about transforming datasets, read
:ref:`Transforming datasets <transforming_datasets>`.

6 changes: 3 additions & 3 deletions doc/source/data/glossary.rst
@@ -107,7 +107,7 @@ Ray Datasets Glossary

>>> import ray
>>> ray.data.from_items(["spam", "ham", "eggs"])
Dataset(num_blocks=3, num_rows=3, schema=<class 'str'>)
MaterializedDatastream(num_blocks=3, num_rows=3, schema=<class 'str'>)

Tensor Dataset
A Dataset that represents a collection of ndarrays.
@@ -119,7 +119,7 @@ Ray Datasets Glossary
>>> import numpy as np
>>> import ray
>>> ray.data.from_numpy(np.zeros((100, 32, 32, 3)))
Dataset(
MaterializedDatastream(
num_blocks=1,
num_rows=100,
schema={__value__: ArrowTensorType(shape=(32, 32, 3), dtype=double)}
@@ -132,7 +132,7 @@ Ray Datasets Glossary

>>> import ray
>>> ray.data.read_csv("s3://anonymous@air-example-data/iris.csv")
Dataset(
Datastream(
num_blocks=1,
num_rows=150,
schema={
2 changes: 1 addition & 1 deletion doc/source/rllib/package_ref/rl_modules.rst
@@ -114,7 +114,7 @@ Constructor
:toctree: doc/

MultiAgentRLModule
MultiAgentRLModule.build
MultiAgentRLModule.setup()
MultiAgentRLModule.as_multi_agent

Modifying the underlying RL modules
16 changes: 0 additions & 16 deletions python/ray/data/_internal/execution/interfaces.py
@@ -191,22 +191,6 @@ class ExecutionOptions:
"""Common options for execution.

Some options may not be supported on all executors (e.g., resource limits).

Contributor

Contributor Author

cool, thanks

Contributor

@pcmoritz you can undo this particular diff now (fixed in master).

Contributor

Thanks, will do!

Attributes:
resource_limits: Set a soft limit on the resource usage during execution.
This is not supported in bulk execution mode. Autodetected by default.
locality_with_output: Set this to prefer running tasks on the same node as the
output node (node driving the execution). It can also be set to a list of
node ids to spread the outputs across those nodes. Off by default.
preserve_order: Set this to preserve the ordering between blocks processed by
operators under the streaming executor. The bulk executor always preserves
order. Off by default.
actor_locality_enabled: Whether to enable locality-aware task dispatch to
actors (on by default). This applies to both ActorPoolStrategy map and
streaming_split operations.
verbose_progress: Whether to report progress individually per operator. By
default, only AllToAll operators and global progress is reported. This
option is useful for performance debugging. Off by default.
"""

resource_limits: ExecutionResources = ExecutionResources()
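For context, the attributes documented in the removed block are dataclass fields, so execution behavior is tuned by constructing ExecutionOptions with keyword arguments. A minimal sketch under that assumption (the import path is internal Ray API and may change between versions):

    from ray.data._internal.execution.interfaces import (
        ExecutionOptions,
        ExecutionResources,
    )

    # Soft-limit CPU usage and preserve block ordering under the
    # streaming executor; the other fields keep their defaults.
    options = ExecutionOptions(
        resource_limits=ExecutionResources(cpu=4),
        preserve_order=True,
    )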
8 changes: 6 additions & 2 deletions python/ray/data/dataset.py
@@ -440,8 +440,12 @@ def map_batches(
... "age": [4, 14, 9]
... })
>>> ds = ray.data.from_pandas(df)
>>> ds
Datastream(num_blocks=1, num_rows=3, schema={name: object, age: int64})
>>> ds # doctest: +SKIP
MaterializedDatastream(
num_blocks=1,
num_rows=3,
schema={name: object, age: int64}
)

Call :meth:`.default_batch_format` to determine the default batch
type.
8 changes: 4 additions & 4 deletions python/ray/data/dataset_iterator.py
@@ -49,9 +49,9 @@ class DatasetIterator(abc.ABC):
>>> import ray
>>> ds = ray.data.range(5)
>>> ds
Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
Datastream(num_blocks=5, num_rows=5, schema=<class 'int'>)
>>> ds.iterator()
DatasetIterator(Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>))
DatasetIterator(Datastream(num_blocks=5, num_rows=5, schema=<class 'int'>))
>>> ds = ds.repeat(); ds
DatasetPipeline(num_windows=inf, num_stages=2)
>>> ds.iterator()
@@ -641,7 +641,7 @@ def to_tf(
... "s3://anonymous@air-example-data/iris.csv"
... )
>>> it = ds.iterator(); it
DatasetIterator(Dataset(
DatasetIterator(Datastream(
num_blocks=1,
num_rows=150,
schema={
@@ -672,7 +672,7 @@ def to_tf(
>>> it = preprocessor.transform(ds).iterator()
>>> it
DatasetIterator(Concatenator
+- Dataset(
+- Datastream(
num_blocks=1,
num_rows=150,
schema={
2 changes: 1 addition & 1 deletion python/ray/train/torch/torch_trainer.py
@@ -227,7 +227,7 @@ def train_loop_per_worker():
best_checkpoint_loss = result.metrics['loss']

# Assert loss is less than 0.09
assert best_checkpoint_loss <= 0.09
assert best_checkpoint_loss <= 0.09 # doctest: +SKIP

.. testoutput::
:hide:
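Both this hunk and the map_batches one above rely on standard doctest directive syntax. A standalone illustration (not from this PR) of how # doctest: +SKIP suppresses an example whose output is unstable:

    import time

    def halve(x: float) -> float:
        """Halve a number.

        >>> halve(4.0)
        2.0
        >>> time.time()  # doctest: +SKIP
        1681300000.0
        """
        return x / 2

The skipped example is still shown to readers but never executed, so output that varies from run to run (or, as in this PR, between Ray versions) cannot break the doc build.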