-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[data] randomize_block_order() not compatible with stage fusion #26090
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -329,6 +329,8 @@ def _optimize(self) -> Tuple[BlockList, DatasetStats, List[Stage]]: | |
""" | ||
context = DatasetContext.get_current() | ||
blocks, stats, stages = self._get_source_blocks_and_stages() | ||
if context.optimize_reorder_stages: | ||
stages = _reorder_stages(stages) | ||
if context.optimize_fuse_stages: | ||
if context.optimize_fuse_read_stages: | ||
# If using a lazy datasource, rewrite read stage into one-to-one stage | ||
|
@@ -692,6 +694,18 @@ def __call__( | |
return blocks, stage_info | ||
|
||
|
||
class RandomizeBlocksStage(AllToAllStage): | ||
def __init__(self, seed: Optional[int]): | ||
def do_randomize_block_order(block_list, *_): | ||
num_blocks = block_list.executed_num_blocks() # Blocking. | ||
if num_blocks == 0: | ||
return block_list, {} | ||
randomized_block_list = block_list.randomize_block_order(seed) | ||
return randomized_block_list, {} | ||
|
||
super().__init__("randomize_block_order", None, do_randomize_block_order) | ||
|
||
|
||
def _rewrite_read_stages( | ||
blocks: BlockList, | ||
stats: DatasetStats, | ||
|
@@ -741,6 +755,36 @@ def block_fn(read_fn: Callable[[], Iterator[Block]]) -> Iterator[Block]: | |
return block_list, stats, stage | ||
|
||
|
||
def _reorder_stages(stages: List[Stage]) -> List[Stage]: | ||
"""Reorder randomize stages to the end to enable better stage fusion. | ||
|
||
This applies to RandomizeBlockOrder stages specifically (issue #26057). | ||
|
||
Args: | ||
stages: Stages to try to reorder. | ||
|
||
Returns: | ||
Reordered stages. | ||
""" | ||
|
||
output: List[Stage] = [] | ||
reorder_buf: List[RandomizeBlocksStage] = [] | ||
|
||
for s in stages: | ||
if isinstance(s, RandomizeBlocksStage): | ||
# Buffer it for later reordering. | ||
reorder_buf.append(s) | ||
else: | ||
# Barrier: flush the reorder buffer. | ||
if isinstance(s, AllToAllStage): | ||
output.extend(reorder_buf) | ||
reorder_buf = [] | ||
output.append(s) | ||
|
||
output.extend(reorder_buf) | ||
return output | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! This is a good one-off. |
||
|
||
|
||
def _fuse_one_to_one_stages(stages: List[Stage]) -> List[Stage]: | ||
"""Fuses compatible one-to-one stages. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,12 @@ | |
from ray.data._internal.fast_repartition import fast_repartition | ||
from ray.data._internal.lazy_block_list import LazyBlockList | ||
from ray.data._internal.output_buffer import BlockOutputBuffer | ||
from ray.data._internal.plan import AllToAllStage, ExecutionPlan, OneToOneStage | ||
from ray.data._internal.plan import ( | ||
AllToAllStage, | ||
ExecutionPlan, | ||
OneToOneStage, | ||
RandomizeBlocksStage, | ||
) | ||
from ray.data._internal.progress_bar import ProgressBar | ||
from ray.data._internal.remote_fn import cached_remote_fn | ||
from ray.data._internal.shuffle_and_partition import ( | ||
|
@@ -169,6 +174,8 @@ def __init__( | |
plan: ExecutionPlan, | ||
epoch: int, | ||
lazy: bool, | ||
*, | ||
defer_execution: bool = False, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had hard time understand why there's a defer_execution when there is already a lazy parameter? Seems to muddy the execution semantics even more. |
||
): | ||
"""Construct a Dataset (internal API). | ||
|
||
|
@@ -183,7 +190,7 @@ def __init__( | |
self._epoch = epoch | ||
self._lazy = lazy | ||
|
||
if not lazy: | ||
if not lazy and not defer_execution: | ||
self._plan.execute(allow_clear_input_blocks=False) | ||
|
||
@staticmethod | ||
|
@@ -808,26 +815,11 @@ def randomize_block_order( | |
based on system randomness. | ||
|
||
Returns: | ||
The shuffled dataset. | ||
The block-shuffled dataset. | ||
""" | ||
|
||
def do_randomize_block_order(block_list, *_): | ||
num_blocks = block_list.executed_num_blocks() # Blocking. | ||
if num_blocks == 0: | ||
return block_list, {} | ||
|
||
randomized_block_list = block_list.randomize_block_order(seed) | ||
|
||
return randomized_block_list, {} | ||
|
||
plan = self._plan.with_stage( | ||
AllToAllStage( | ||
"randomize_block_order", | ||
None, | ||
do_randomize_block_order, | ||
) | ||
) | ||
return Dataset(plan, self._epoch, self._lazy) | ||
plan = self._plan.with_stage(RandomizeBlocksStage(seed)) | ||
return Dataset(plan, self._epoch, self._lazy, defer_execution=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm having both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point. I can't remember why I did this... removed it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, it breaks lazy read->map_batches() fusion, that's why. I reverted this since it broke a unit test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm do you know why it breaks that? |
||
|
||
def random_sample( | ||
self, fraction: float, *, seed: Optional[int] = None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ericl If you rewrite the read stage before you reorder stages, won't the read->map_batches fusion work without the
lazy
vsdefer_execution
hack?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, because if you call .randomize_blocks() eagerly it will materialize the read blocks, so it's too late. We have to force it to be lazy as a special case.
I also need this for auto_repartition() anyway, so I think this is a sensible thing... and if we move to lazy by default it will go away.