From afeaa9afba41bd3b8d688ac3da2a25c5834dd4af Mon Sep 17 00:00:00 2001 From: janEbert Date: Wed, 4 Sep 2024 21:18:22 +0200 Subject: [PATCH] Do not consume unnecessary memory during sharding There is no need to create a temporary list whose length is the step/world size, which can be very large. --- src/datasets/iterable_dataset.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index 5f5c49f1556..57b98575c1f 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -556,12 +556,7 @@ def _init_state_dict(self) -> dict: def __iter__(self): ex_iterator = iter(self.ex_iterable) - while True: - batch = list(islice(ex_iterator, self.step)) - if len(batch) > self.offset: - yield batch[self.offset] - else: - break + return islice(ex_iterator, self.offset, None, self.step) def shuffle_data_sources(self, generator: np.random.Generator) -> "StepExamplesIterable": return StepExamplesIterable(