From 560e31f6ecdb0003d132719a1b7e448c79f236bb Mon Sep 17 00:00:00 2001
From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com>
Date: Thu, 27 Jun 2024 10:36:53 -0700
Subject: [PATCH] Use closed-formula to round by multiple (#9307)

* Use closed-formula to round by multiple

Signed-off-by: Alexandros Koumparoulis

* Apply isort and black reformatting

Signed-off-by: akoumpa

---------

Signed-off-by: Alexandros Koumparoulis
Signed-off-by: akoumpa
Co-authored-by: akoumpa
Co-authored-by: Pablo Garay
Signed-off-by: ashors1
---
 .../stable_diffusion/encoders/modules.py      | 22 ++++++++++++++-----
 .../language_modeling/megatron_base_model.py  |  3 +--
 nemo/lightning/base.py                        |  3 +--
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py
index bff579bbca4fb..ab33532c3c1fa 100644
--- a/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py
+++ b/nemo/collections/multimodal/modules/stable_diffusion/encoders/modules.py
@@ -298,7 +298,7 @@ def encode(self, x):
 
 
 class BERTTokenizer(AbstractEncoder):
-    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
+    """Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
 
     def __init__(self, device="cuda", vq_interface=True, max_length=77):
         super().__init__()
@@ -530,7 +530,10 @@ def __init__(
         print(f"Downloading clip with", arch, version, cache_dir)
         self.device = device
         model, _, _ = open_clip.create_model_and_transforms(
-            arch, device=torch.device("cpu"), pretrained=version, cache_dir=cache_dir,
+            arch,
+            device=torch.device("cpu"),
+            pretrained=version,
+            cache_dir=cache_dir,
         )
         del model.visual
         self.model = model
@@ -669,7 +672,11 @@ def build_tokenizer(self, cfg):
             legacy=legacy,
         )
 
-        _, self.text_transform = get_preprocess_fns(cfg, self.tokenizer, is_train=False,)
+        _, self.text_transform = get_preprocess_fns(
+            cfg,
+            self.tokenizer,
+            is_train=False,
+        )
         self.max_length = cfg.text.get("max_position_embeddings")
 
     def load_model(self, cfg, state_dict):
@@ -699,8 +706,7 @@ def load_model(self, cfg, state_dict):
     def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by, tensor_model_parallel_size):
         after = orig_vocab_size
         multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
-        while (after % multiple) != 0:
-            after += 1
+        after = ((after + multiple - 1) // multiple) * multiple
         return after
 
     def forward(self, text):
@@ -765,7 +771,11 @@ def __init__(
         super().__init__()
         assert layer in self.LAYERS
         self.projection_dim = 1280
-        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device("cpu"), pretrained=version,)
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device("cpu"),
+            pretrained=version,
+        )
         del model.visual
         self.model = model
 
diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
index 8c423707b9895..ae659e7574964 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py
@@ -581,8 +581,7 @@ def _vocab_size_with_padding(self, orig_vocab_size, make_vocab_size_divisible_by
 
         after = orig_vocab_size
         multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
-        while (after % multiple) != 0:
-            after += 1
+        after = ((after + multiple - 1) // multiple) * multiple
         logging.info(
             f'Padded vocab_size: {after}, original vocab_size: {orig_vocab_size}, dummy tokens: {after - orig_vocab_size}.'
         )
diff --git a/nemo/lightning/base.py b/nemo/lightning/base.py
index ba5daf12f95fc..128ecb661efd8 100644
--- a/nemo/lightning/base.py
+++ b/nemo/lightning/base.py
@@ -26,8 +26,7 @@ def get_vocab_size(
 
     after = vocab_size
     multiple = make_vocab_size_divisible_by * config.tensor_model_parallel_size
-    while (after % multiple) != 0:
-        after += 1
+    after = ((after + multiple - 1) // multiple) * multiple
     logging.info(
         f"Padded vocab_size: {after}, original vocab_size: {vocab_size}, dummy tokens:" f" {after - vocab_size}."
     )
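
Note (not part of the patch): the change replaces an O(multiple) increment loop with a closed-form round-up-to-multiple. A minimal standalone sketch follows, assuming plain Python ints; the helper names round_up and round_up_loop are illustrative only and do not appear in the NeMo code.

    # Sketch: the closed formula gives the same padded size as the removed loop.

    def round_up_loop(after: int, multiple: int) -> int:
        # Old behaviour: increment until divisible by `multiple`.
        while (after % multiple) != 0:
            after += 1
        return after

    def round_up(after: int, multiple: int) -> int:
        # New behaviour: ceiling division, then scale back up -- one expression, no loop.
        return ((after + multiple - 1) // multiple) * multiple

    if __name__ == "__main__":
        # Example values: make_vocab_size_divisible_by=128 and tensor parallel size 8
        # give multiple=1024; 32000 pads to 32768, exact multiples are left unchanged.
        multiple = 128 * 8
        for vocab_size in (32000, 32768, 1, 1023, 1024, 1025):
            assert round_up(vocab_size, multiple) == round_up_loop(vocab_size, multiple)
        print("closed formula matches the loop")

The formula avoids iterating once per dummy token, which matters only marginally for runtime but makes the intent (round up to the next multiple) explicit in a single line.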