ray-project · jianoaix · Apr 6, 2023 · Mar 22, 2023 · Mar 23, 2023 · Mar 24, 2023
@@ -30,7 +30,7 @@ def _to_block_iterator(
         ds = self._base_dataset
         block_iterator, stats, executor = ds._plan.execute_to_iterator()
         ds._current_executor = executor
-        return block_iterator, stats
+        return block_iterator, stats, False
 
     def stats(self) -> str:
         return self._base_dataset.stats()

@@ -35,11 +35,20 @@ def _to_block_iterator(
     ]:
         epoch_pipeline = self._get_next_dataset()
 
+        if epoch_pipeline._first_dataset is not None:
+            blocks_owned_by_consumer = (
+                epoch_pipeline._first_dataset._plan.execute()._owned_by_consumer
+            )
+        else:
+            blocks_owned_by_consumer = (
+                epoch_pipeline._peek()._plan.execute()._owned_by_consumer
+            )
+
         def block_iter():
             for ds in epoch_pipeline.iter_datasets():
                 yield from ds._plan.execute().iter_blocks_with_metadata()
 
-        return block_iter(), None
+        return block_iter(), None, blocks_owned_by_consumer
 
     def iter_batches(
         self,

@@ -100,7 +100,7 @@ def gen_blocks() -> Iterator[Tuple[ObjectRef[Block], BlockMetadata]]:
                     )
                     yield block_ref
 
-        return gen_blocks(), None
+        return gen_blocks(), None, False
 
     def stats(self) -> str:
         """Implements DatasetIterator."""

@@ -153,7 +153,7 @@ def iter_batches(
 
         time_start = time.perf_counter()
 
-        block_iterator, stats = self._to_block_iterator()
+        block_iterator, stats, blocks_owned_by_consumer = self._to_block_iterator()
         if use_legacy:
             # Legacy iter_batches does not use metadata.
             def drop_metadata(block_iterator):
@@ -164,6 +164,7 @@ def drop_metadata(block_iterator):
                 drop_metadata(block_iterator),
                 stats=stats,
                 prefetch_blocks=prefetch_blocks,
+                clear_block_after_read=blocks_owned_by_consumer,
                 batch_size=batch_size,
                 batch_format=batch_format,
                 drop_last=drop_last,
@@ -175,6 +176,7 @@ def drop_metadata(block_iterator):
             yield from iter_batches(
                 block_iterator,
                 stats=stats,
+                clear_block_after_read=blocks_owned_by_consumer,
                 batch_size=batch_size,
                 batch_format=batch_format,
                 drop_last=drop_last,

@@ -393,6 +393,12 @@ def test_iter_batches_basic(ray_start_regular_shared):
     assert all(len(e) == 1 for e in batches)
 
 
+def test_to_torch(ray_start_regular_shared):
+    pipe = ray.data.range(10, parallelism=10).window(blocks_per_window=2)
+    batches = list(pipe.to_torch(batch_size=None))
+    assert len(batches) == 10  # presumably one batch per block; confirm
+
+
 def test_iter_batches_batch_across_windows(ray_start_regular_shared):
     # 3 windows, each containing 3 blocks, each containing 3 rows.
     pipe = ray.data.range(27, parallelism=9).window(blocks_per_window=3)

@@ -16,6 +16,17 @@ def check_no_spill(ctx, pipe):
     assert "Spilled" not in meminfo, meminfo
 
 
+def check_to_torch_no_spill(ctx, pipe):
+    # Stress test: drain up to 10 epochs of the pipeline through to_torch(),
+    # then assert that "Spilled" does not appear in the memory summary.
+    max_epoch = 10
+    for p in pipe.iter_epochs(max_epoch):
+        for _ in p.to_torch(batch_size=None):
+            pass
+    meminfo = memory_summary(ctx.address_info["address"], stats_only=True)
+    assert "Spilled" not in meminfo, meminfo
+
+
 def test_iter_batches_no_spilling_upon_no_transformation(shutdown_only):
     # The object store is about 300MB.
     ctx = ray.init(num_cpus=1, object_store_memory=300e6)
@@ -24,6 +35,8 @@ def test_iter_batches_no_spilling_upon_no_transformation(shutdown_only):
 
     check_no_spill(ctx, ds.repeat())
     check_no_spill(ctx, ds.window(blocks_per_window=20))
+    check_to_torch_no_spill(ctx, ds.repeat())
+    check_to_torch_no_spill(ctx, ds.window(blocks_per_window=20))
 
 
 def test_iter_batches_no_spilling_upon_rewindow(shutdown_only):
@@ -35,6 +48,9 @@ def test_iter_batches_no_spilling_upon_rewindow(shutdown_only):
     check_no_spill(
         ctx, ds.window(blocks_per_window=20).repeat().rewindow(blocks_per_window=10)
     )
+    check_to_torch_no_spill(
+        ctx, ds.window(blocks_per_window=20).repeat().rewindow(blocks_per_window=10)
+    )
 
 
 def test_iter_batches_no_spilling_upon_prior_transformation(shutdown_only):