From f2ea3609c4bd1d48b1dddff0c9c0c44649cfd02c Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 15 Feb 2024 17:34:36 -0700
Subject: [PATCH] Fix dreambooth data sampler issue (#8400) (#8413)

* Turn on drop last

* Some neva fixes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: yaoyu-33
Co-authored-by: yaoyu-33 <54727607+yaoyu-33@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: ataghibakhsh
---
 .../multimodal/models/text_to_image/dreambooth/dreambooth.py | 2 +-
 nemo/collections/multimodal/parts/utils.py                   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py
index ce82da9bd171..704f8b39371a 100644
--- a/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py
+++ b/nemo/collections/multimodal/models/text_to_image/dreambooth/dreambooth.py
@@ -487,7 +487,7 @@ def setup_training_data(self, cfg):
             global_batch_size=self.cfg.global_batch_size,
             data_parallel_rank=parallel_state.get_data_parallel_rank(),
             data_parallel_size=parallel_state.get_data_parallel_world_size(),
-            drop_last=False,
+            drop_last=True,
         )

         self._train_dl = torch.utils.data.DataLoader(
diff --git a/nemo/collections/multimodal/parts/utils.py b/nemo/collections/multimodal/parts/utils.py
index c82e0cd37140..4d4e952db0ce 100644
--- a/nemo/collections/multimodal/parts/utils.py
+++ b/nemo/collections/multimodal/parts/utils.py
@@ -22,9 +22,11 @@
 from pytorch_lightning.plugins.environments import TorchElasticEnvironment
 from transformers import CLIPImageProcessor

+from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel
 from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
 from nemo.collections.nlp.parts.peft_config import PEFT_CONFIG_MAP
 from nemo.utils import AppState, logging
+from nemo.utils.model_utils import inject_model_parallel_rank

 try:
     from megatron.core import dist_checkpointing
@@ -361,6 +363,7 @@ def create_neva_model_and_processor(cfg):
         neva_cfg.activations_checkpoint_method = None
         neva_cfg.precision = trainer.precision
         neva_cfg.mm_cfg.llm.from_pretrained = cfg.get('base_model_file', None)
+        neva_cfg.apply_rope_fusion = False
         # neva_cfg.mm_cfg.vision_encoder.from_pretrained = None

         model = MegatronNevaModel.restore_from(
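
Note (reviewer addition, not part of the patch): the one-character change in
dreambooth.py, drop_last=False -> drop_last=True, makes the training sampler
discard the final incomplete global batch instead of emitting it. The sketch
below is a minimal plain-PyTorch illustration of the behavior being toggled;
it does not use NeMo's MegatronPretrainingSampler, and the dataset and batch
sizes are made up for the example.

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # 10 samples with batch_size=4: 10 % 4 != 0, so the last batch is ragged.
    dataset = TensorDataset(torch.arange(10, dtype=torch.float32))

    for drop_last in (False, True):
        loader = DataLoader(dataset, batch_size=4, drop_last=drop_last)
        sizes = [batch[0].shape[0] for batch in loader]
        print(f"drop_last={drop_last}: batch sizes = {sizes}")

    # drop_last=False: batch sizes = [4, 4, 2]  <- ragged final batch
    # drop_last=True:  batch sizes = [4, 4]     <- every batch full-sized

In Megatron-style data-parallel training, each rank generally assumes every
global batch holds exactly global_batch_size samples; a short final batch
violates that assumption, which is presumably the sampler issue that #8400
reports, so dropping the remainder is the conventional fix.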