Added memory logger (#8395)
* Added memory logger

Signed-off-by: Selvaraj Anandaraj <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Selvaraj Anandaraj <[email protected]>
Co-authored-by: Selvaraj Anandaraj <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Harper <[email protected]>
4 people authored and yaoyu-33 committed Feb 26, 2024
1 parent 4668047 commit 47223f9
Showing 1 changed file with 7 additions and 0 deletions.
@@ -331,6 +331,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):
         self.get_attention_mask_from_fusion = self.cfg.get('get_attention_mask_from_fusion', True)
         self.initialize_ub = self.cfg.get('ub_tp_comm_overlap', False)
         self.log_train_loss = bool(int(os.getenv("NEMO_LOG_TRAIN_LOSS", 1)))
+        self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0)))
         self.loss_broadcast_src_rank = None
 
         self.inference_params = None
@@ -690,6 +691,12 @@ def training_step(self, dataloader_iter, batch_idx):
         self.allreduce_first_last_embeddings()
         self.megatron_timer_stop('allreduce_first_last_embeddings')
 
+        if self.log_memory_usage:
+            mem_reserved = torch.cuda.max_memory_reserved()
+            self.log(
+                'peak_memory_usage', mem_reserved, prog_bar=True, rank_zero_only=True, batch_size=1,
+            )
+
         ## logging
         if self.log_train_loss:
             # When using pipeline parallelism, loss is calculated only in the last pipeline stage and
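For context, here is a minimal standalone sketch of the pattern this commit adds: an environment-variable toggle (NEMO_LOG_MEMORY_USAGE, off by default) read once at construction time, and a peak-reserved-CUDA-memory readout taken at the end of each training step. This sketch is not part of the commit; the names ToyMemoryLogger and run_steps are hypothetical, and print stands in for the Lightning self.log(...) call used in the diff.

import os

import torch


class ToyMemoryLogger:
    """Illustrative stand-in for the model class touched by this commit."""

    def __init__(self):
        # Same idiom as the diff: disabled unless NEMO_LOG_MEMORY_USAGE=1 is set.
        self.log_memory_usage = bool(int(os.getenv("NEMO_LOG_MEMORY_USAGE", 0)))

    def training_step(self, batch_idx: int) -> None:
        # ... forward/backward/optimizer work would happen here ...
        if self.log_memory_usage and torch.cuda.is_available():
            # Peak bytes reserved by the CUDA caching allocator so far in the
            # process (or since the last torch.cuda.reset_peak_memory_stats()).
            mem_reserved = torch.cuda.max_memory_reserved()
            # The real code logs this via Lightning's self.log(...) with
            # prog_bar=True and rank_zero_only=True; print keeps the sketch
            # self-contained.
            print(f"step {batch_idx}: peak_memory_usage={mem_reserved} bytes")


def run_steps(num_steps: int = 3) -> None:
    model = ToyMemoryLogger()
    for i in range(num_steps):
        model.training_step(i)


if __name__ == "__main__":
    run_steps()

To enable the same behaviour on the real model, set NEMO_LOG_MEMORY_USAGE=1 in the training environment (for example, NEMO_LOG_MEMORY_USAGE=1 python train.py); per the diff, the value is then reported as peak_memory_usage in the progress bar, from rank zero only.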
