
Commit

Change the megatron config lr scheduler default and fix to change partitions script (#8094)

* fix the config file issues

Signed-off-by: Shantanu Acharya <[email protected]>

* disable sequence parallel for mcore

Signed-off-by: Shantanu Acharya <[email protected]>

---------

Signed-off-by: Shantanu Acharya <[email protected]>
shan18 authored Dec 29, 2023
1 parent 13c4d5e commit 7faeee8
Showing 3 changed files with 35 additions and 26 deletions.
26 changes: 13 additions & 13 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -13,7 +13,7 @@ trainer:
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
@@ -34,7 +34,7 @@ exp_manager:
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
resume_from_checkpoint: ${model.resume_from_checkpoint}
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
@@ -99,7 +99,7 @@ model:
type: 'GPT2BPETokenizer'
model: null
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

@@ -144,7 +144,7 @@ model:
# These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
# 'full' will checkpoint the entire transformer layer.
activations_checkpoint_granularity: null # 'selective' or 'full'
activations_checkpoint_method: null # 'uniform', 'block'
# 'uniform' divides the total number of transformer layers and checkpoints the input activation
# of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
@@ -175,9 +175,9 @@ model:
## Transformer Engine
transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0 # scaling margin
fp8_interval: 1 # scaling update interval
fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
@@ -193,13 +193,13 @@ model:

## Flash Attention
use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True

data:
# Path to data must be specified by the user.
# Supports List, String and Dictionary
# List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
# Or see example below:
# data_prefix:
# - .5
# - /raid/data/pile/my-gpt3_00_text_document
# - .5
@@ -231,18 +231,18 @@ model:
end_step: 10 # Global batch to end profiling
ranks: [0] # Global rank IDs to profile
gen_shape: False # Generate model and kernel details including input shapes

optim:
name: fused_adam
lr: 2e-4
weight_decay: 0.01
betas:
- 0.9
- 0.98
sched:
name: CosineAnnealing
warmup_steps: 500
constant_steps: 50000
constant_steps: 0
min_lr: 2e-5

gc_interval: 0
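The substantive edit in this file is the scheduler default: `constant_steps` drops from 50000 to 0, so the cosine decay is spread over the remaining training steps instead of being shortened to make room for a long constant phase. A rough sketch of how these `sched` parameters interact, assuming `constant_steps` means a terminal phase held at `min_lr` after the decay (NeMo's actual `CosineAnnealing` scheduler may differ in detail):

```python
import math

def cosine_annealing_lr(step, max_steps=100000, lr=2e-4, min_lr=2e-5,
                        warmup_steps=500, constant_steps=0):
    """Sketch of a warmup + cosine-annealing schedule (not NeMo's implementation)."""
    decay_steps = max(max_steps - warmup_steps - constant_steps, 1)
    if step < warmup_steps:
        # linear warmup from 0 up to the base lr
        return lr * step / max(warmup_steps, 1)
    if step >= warmup_steps + decay_steps:
        # assumed terminal phase: hold at min_lr for the last `constant_steps` steps
        return min_lr
    # cosine decay from lr down to min_lr
    progress = (step - warmup_steps) / decay_steps
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

# With the new default constant_steps=0, the decay spans all post-warmup steps;
# the old default of 50000 would reserve half of a 100k-step run for the hold phase.
for s in (0, 500, 50_000, 100_000):
    print(s, cosine_annealing_lr(s))
```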
30 changes: 18 additions & 12 deletions examples/nlp/language_modeling/megatron_change_num_partitions.py
@@ -78,7 +78,7 @@
### Only Tensor Parallelism conversion ###
To the above commands, add the following argument: `--tp_conversion_only`
# Note: This requires that the pipeline_model_parallel_size and tgt_pipeline_model_parallel_size are set to 1.
@@ -95,9 +95,9 @@
### Model Classes ###
# NOTE: Conversion of other model types.
# Default model type is MegatronGPTModel; if you want another model, you need to pass the classpath of the model
# For example - MegatronT5Model -
python megatron_change_num_partitions.py \
...
@@ -106,7 +106,7 @@
# Additional arguments:
--num_gpu_per_node: Number of GPUs per node. Default is 8.
--megatron_legacy: Whether the model is a legacy Megatron model or not. Default is False. May be unsupported for
Pipeline Parallelism change.
--tokenizer_model_path: Path to tokenizer model. Default is None. When not None, overrides the tokenizer model path
in the model config.
@@ -146,23 +146,29 @@ def force_cpu_model(cfg):
# temporarily set to cpu
original_cpu_init = cfg.get('use_cpu_initialization', False)
if 'megatron_amp_O2' in cfg:
key = 'megatron_amp_O2'
amp_o2_key = 'megatron_amp_O2'
original_amp_o2 = cfg.megatron_amp_O2
elif 'megatron_amp_02' in cfg:
key = 'megatron_amp_02'
amp_o2_key = 'megatron_amp_02'
original_amp_o2 = cfg.megatron_amp_02
else:
key, original_amp_o2 = None, None
amp_o2_key, original_amp_o2 = None, None

# Set new values
cfg.use_cpu_initialization = True
if key is not None:
cfg[key] = False
if amp_o2_key is not None:
cfg[amp_o2_key] = False

# Disable sequence parallelism - not disabling this raises an error when converting the model to TP=1
original_sequence_parallel = cfg.get('sequence_parallel', None)
cfg.sequence_parallel = False

# Setup restore dict
restore_dict = {'use_cpu_initialization': original_cpu_init} # 'megatron_amp_O2': original_amp_o2
if key is not None:
restore_dict[key] = original_amp_o2
if amp_o2_key is not None:
restore_dict[amp_o2_key] = original_amp_o2
if original_sequence_parallel is not None:
restore_dict['sequence_parallel'] = original_sequence_parallel

return cfg, restore_dict

@@ -1239,7 +1245,7 @@ def main():

"""
Under VP convention
Notation :
Stage = PP rank
Number = GPT model / layer index
Ignore TP - every PP has all TP corresponding to that PP
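In this script, `force_cpu_model` now stores the AMP-O2 flag under the dedicated name `amp_o2_key` (covering both the `megatron_amp_O2` spelling and the legacy `megatron_amp_02` one), force-disables `sequence_parallel`, and records every overridden value in `restore_dict` so the original settings can be put back after the CPU-only load. A minimal sketch of that save-override-restore pattern on an OmegaConf config; `restore_model_config` is a hypothetical helper name, not necessarily what this script calls it:

```python
from omegaconf import OmegaConf

def restore_model_config(cfg, restore_dict):
    """Hypothetical helper: undo the temporary overrides recorded by force_cpu_model()."""
    for key, original_value in restore_dict.items():
        cfg[key] = original_value
    return cfg

cfg = OmegaConf.create(
    {'use_cpu_initialization': False, 'megatron_amp_O2': True, 'sequence_parallel': True}
)

# What force_cpu_model() effectively does: remember the originals, then override.
restore_dict = {
    'use_cpu_initialization': cfg.use_cpu_initialization,
    'megatron_amp_O2': cfg.megatron_amp_O2,
    'sequence_parallel': cfg.sequence_parallel,
}
cfg.use_cpu_initialization = True
cfg.megatron_amp_O2 = False
cfg.sequence_parallel = False

# ... load and repartition the model on CPU ...

cfg = restore_model_config(cfg, restore_dict)
print(OmegaConf.to_yaml(cfg))
```

Keeping the original values in a dict lets the conversion hand the config back unchanged once the temporary CPU-only, non-parallel load is done.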
5 changes: 4 additions & 1 deletion scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
@@ -37,7 +37,7 @@
--target_pipeline_model_parallel_size=1
2) extract your nemo file to a folder with
tar -xvf filename.nemo
Then, run this conversion script:
python convert_nemo_gpt_to_mcore.py \
--in-folder <path to extracted, TP1 PP1 legacy checkpoint folder> \
@@ -186,6 +186,9 @@ def restore_model(nemo_file, cpu_only=False):
)
model_config.use_cpu_initialization = cpu_only

if model_config.get('sequence_parallel', None):
model_config.sequence_parallel = False

# To copy weights in the original precision, we have to turn on O2.
orig_megatron_amp_O2_value = model_config.megatron_amp_O2
if "target" in model_config and model_config.target.endswith("MegatronGPTSFTModel"):
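The mcore conversion script gets the same guard: `sequence_parallel` is switched off before the checkpoint is restored. Sequence parallelism only makes sense when activations are sharded across several tensor-parallel ranks, and this conversion path works from a TP1 PP1 view of the weights, where there is nothing to shard. A toy sketch of that relationship (illustrative only, not code from this repository):

```python
def sequence_shard_length(seq_len: int, tp_size: int, sequence_parallel: bool) -> int:
    """Toy model of sequence parallelism: the sequence dimension is split across TP ranks."""
    if not sequence_parallel or tp_size == 1:
        return seq_len  # every rank holds the full sequence
    assert seq_len % tp_size == 0, "sequence length must be divisible by the TP size"
    return seq_len // tp_size

# During training with TP=8 and sequence parallelism, each rank holds a slice ...
print(sequence_shard_length(2048, tp_size=8, sequence_parallel=True))   # 256
# ... but the conversion loads the checkpoint at TP=1, so the flag is forced off first.
print(sequence_shard_length(2048, tp_size=1, sequence_parallel=False))  # 2048
```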
