
Commit

Change the megatron config lr scheduler default and fix to change partitions script (#8094)

* fix the config file issues

Signed-off-by: Shantanu Acharya <[email protected]>

* disable sequence parallel for mcore

Signed-off-by: Shantanu Acharya <[email protected]>

---------

Signed-off-by: Shantanu Acharya <[email protected]>
shan18 authored Dec 29, 2023
1 parent 13c4d5e commit 7faeee8
Showing 3 changed files with 35 additions and 26 deletions.
26 changes: 13 additions & 13 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
@@ -13,7 +13,7 @@ trainer:
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
log_every_n_steps: 10
val_check_interval: 100
@@ -34,7 +34,7 @@ exp_manager:
name: null
resume_if_exists: True
resume_ignore_no_checkpoint: True
resume_from_checkpoint: ${model.resume_from_checkpoint}
create_checkpoint_callback: True
checkpoint_callback_params:
monitor: val_loss
@@ -99,7 +99,7 @@ model:
type: 'GPT2BPETokenizer'
model: null
vocab_file: null
merge_file: null
delimiter: null # only used for tabular tokenizer
sentencepiece_legacy: False # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

@@ -144,7 +144,7 @@ model:
# These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
# See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
# 'full' will checkpoint the entire transformer layer.
activations_checkpoint_granularity: null # 'selective' or 'full'
activations_checkpoint_method: null # 'uniform', 'block'
# 'uniform' divides the total number of transformer layers and checkpoints the input activation
# of each chunk at the specified granularity. When used with 'selective', 'uniform' checkpoints all attention blocks in the model.
@@ -175,9 +175,9 @@ model:
## Transformer Engine
transformer_engine: False
fp8: False # enables fp8 in TransformerLayer forward
fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3
fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID
fp8_margin: 0 # scaling margin
fp8_interval: 1 # scaling update interval
fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor
fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
@@ -193,13 +193,13 @@ model:

## Flash Attention
use_flash_attention: False # Use flash attention in self-attention module, this config does nothing when transformer_engine=True

data:
# Path to data must be specified by the user.
# Supports List, String and Dictionary
# List : can override from the CLI: "model.data.data_prefix=[.5,/raid/data/pile/my-gpt3_00_text_document,.5,/raid/data/pile/my-gpt3_01_text_document]",
# Or see example below:
# data_prefix:
# - .5
# - /raid/data/pile/my-gpt3_00_text_document
# - .5
@@ -231,18 +231,18 @@ model:
end_step: 10 # Global batch to end profiling
ranks: [0] # Global rank IDs to profile
gen_shape: False # Generate model and kernel details including input shapes

optim:
name: fused_adam
lr: 2e-4
weight_decay: 0.01
betas:
- 0.9
- 0.98
sched:
name: CosineAnnealing
warmup_steps: 500
constant_steps: 50000
constant_steps: 0
min_lr: 2e-5

gc_interval: 0
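The substantive edit in this file is the scheduler default: `constant_steps` drops from 50000 to 0, so the cosine decay is spread over the remaining training steps instead of being shortened to make room for a long constant phase. A rough sketch of how these `sched` parameters interact, assuming `constant_steps` means a terminal phase held at `min_lr` after the decay (NeMo's actual `CosineAnnealing` scheduler may differ in detail):

```python
import math

def cosine_annealing_lr(step, max_steps=100000, lr=2e-4, min_lr=2e-5,
                        warmup_steps=500, constant_steps=0):
    """Sketch of a warmup + cosine-annealing schedule (not NeMo's implementation)."""
    decay_steps = max(max_steps - warmup_steps - constant_steps, 1)
    if step < warmup_steps:
        # linear warmup from 0 up to the base lr
        return lr * step / max(warmup_steps, 1)
    if step >= warmup_steps + decay_steps:
        # assumed terminal phase: hold at min_lr for the last `constant_steps` steps
        return min_lr
    # cosine decay from lr down to min_lr
    progress = (step - warmup_steps) / decay_steps
    return min_lr + 0.5 * (lr - min_lr) * (1 + math.cos(math.pi * progress))

# With the new default constant_steps=0, the decay spans all post-warmup steps;
# the old default of 50000 would reserve half of a 100k-step run for the hold phase.
for s in (0, 500, 50_000, 100_000):
    print(s, cosine_annealing_lr(s))
```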
30 changes: 18 additions & 12 deletions examples/nlp/language_modeling/megatron_change_num_partitions.py
@@ -78,7 +78,7 @@
### Only Tensor Parallelism conversion ###
To the above commands, add the following argument: `--tp_conversion_only`
# Note: This requires that the pipeline_model_parallel_size and tgt_pipeline_model_parallel_size are set to 1.
@@ -95,9 +95,9 @@
### Model Classes ###
# NOTE: Conversion of other model types.
# Default model type is MegatronGPTModel; if you want another model, you need to pass the classpath of the model
# For example - MegatronT5Model -
python megatron_change_num_partitions.py \
...
@@ -106,7 +106,7 @@
# Additional arguments:
--num_gpu_per_node: Number of GPUs per node. Default is 8.
--megatron_legacy: Whether the model is a legacy Megatron model or not. Default is False. May be unsupported for
Pipeline Parallelism change.
--tokenizer_model_path: Path to tokenizer model. Default is None. When not None, overrides the tokenizer model path
in the model config.
@@ -146,23 +146,29 @@ def force_cpu_model(cfg):
# temporarily set to cpu
original_cpu_init = cfg.get('use_cpu_initialization', False)
if 'megatron_amp_O2' in cfg:
key = 'megatron_amp_O2'
amp_o2_key = 'megatron_amp_O2'
original_amp_o2 = cfg.megatron_amp_O2
elif 'megatron_amp_02' in cfg:
key = 'megatron_amp_02'
amp_o2_key = 'megatron_amp_02'
original_amp_o2 = cfg.megatron_amp_02
else:
key, original_amp_o2 = None, None
amp_o2_key, original_amp_o2 = None, None

# Set new values
cfg.use_cpu_initialization = True
if key is not None:
cfg[key] = False
if amp_o2_key is not None:
cfg[amp_o2_key] = False

# Disable sequence parallelism - not disabling this raises an error when converting the model to TP=1
original_sequence_parallel = cfg.get('sequence_parallel', None)
cfg.sequence_parallel = False

# Setup restore dict
restore_dict = {'use_cpu_initialization': original_cpu_init} # 'megatron_amp_O2': original_amp_o2
if key is not None:
restore_dict[key] = original_amp_o2
if amp_o2_key is not None:
restore_dict[amp_o2_key] = original_amp_o2
if original_sequence_parallel is not None:
restore_dict['sequence_parallel'] = original_sequence_parallel

return cfg, restore_dict

@@ -1239,7 +1245,7 @@ def main():

"""
Under VP convention
Notation :
Stage = PP rank
Number = GPT model / layer index
Ignore TP - every PP has all TP corresponding to that PP
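In this script, `force_cpu_model` now stores the AMP-O2 flag under the dedicated name `amp_o2_key` (covering both the `megatron_amp_O2` spelling and the legacy `megatron_amp_02` one), force-disables `sequence_parallel`, and records every overridden value in `restore_dict` so the original settings can be put back after the CPU-only load. A minimal sketch of that save-override-restore pattern on an OmegaConf config; `restore_model_config` is a hypothetical helper name, not necessarily what this script calls it:

```python
from omegaconf import OmegaConf

def restore_model_config(cfg, restore_dict):
    """Hypothetical helper: undo the temporary overrides recorded by force_cpu_model()."""
    for key, original_value in restore_dict.items():
        cfg[key] = original_value
    return cfg

cfg = OmegaConf.create(
    {'use_cpu_initialization': False, 'megatron_amp_O2': True, 'sequence_parallel': True}
)

# What force_cpu_model() effectively does: remember the originals, then override.
restore_dict = {
    'use_cpu_initialization': cfg.use_cpu_initialization,
    'megatron_amp_O2': cfg.megatron_amp_O2,
    'sequence_parallel': cfg.sequence_parallel,
}
cfg.use_cpu_initialization = True
cfg.megatron_amp_O2 = False
cfg.sequence_parallel = False

# ... load and repartition the model on CPU ...

cfg = restore_model_config(cfg, restore_dict)
print(OmegaConf.to_yaml(cfg))
```

Keeping the original values in a dict lets the conversion hand the config back unchanged once the temporary CPU-only, non-parallel load is done.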
5 changes: 4 additions & 1 deletion scripts/nlp_language_modeling/convert_nemo_gpt_to_mcore.py
@@ -37,7 +37,7 @@
--target_pipeline_model_parallel_size=1
2) extract your nemo file to a folder with
tar -xvf filename.nemo
Then, run this conversion script:
python convert_nemo_gpt_to_mcore.py \
--in-folder <path to extracted, TP1 PP1 legacy checkpoint folder> \
@@ -186,6 +186,9 @@ def restore_model(nemo_file, cpu_only=False):
)
model_config.use_cpu_initialization = cpu_only

if model_config.get('sequence_parallel', None):
model_config.sequence_parallel = False

# To copy weights in the original precision, we have to turn on O2.
orig_megatron_amp_O2_value = model_config.megatron_amp_O2
if "target" in model_config and model_config.target.endswith("MegatronGPTSFTModel"):
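The mcore conversion script gets the same guard: `sequence_parallel` is switched off before the checkpoint is restored. Sequence parallelism only makes sense when activations are sharded across several tensor-parallel ranks, and this conversion path works from a TP1 PP1 view of the weights, where there is nothing to shard. A toy sketch of that relationship (illustrative only, not code from this repository):

```python
def sequence_shard_length(seq_len: int, tp_size: int, sequence_parallel: bool) -> int:
    """Toy model of sequence parallelism: the sequence dimension is split across TP ranks."""
    if not sequence_parallel or tp_size == 1:
        return seq_len  # every rank holds the full sequence
    assert seq_len % tp_size == 0, "sequence length must be divisible by the TP size"
    return seq_len // tp_size

# During training with TP=8 and sequence parallelism, each rank holds a slice ...
print(sequence_shard_length(2048, tp_size=8, sequence_parallel=True))   # 256
# ... but the conversion loads the checkpoint at TP=1, so the flag is forced off first.
print(sequence_shard_length(2048, tp_size=1, sequence_parallel=False))  # 2048
```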
