Commit 458b20a

Merge branch 'main' into cherry-pick-main-c98b9c18341708de2d1e8abd3249c6ae0d8869e5

Signed-off-by: akoumpa <[email protected]>
akoumpa authored Feb 26, 2024
2 parents 1097428 + a7709b8 commit 458b20a
Showing 37 changed files with 1,151 additions and 1,650 deletions.
107 changes: 0 additions & 107 deletions .github/workflows/import-test.yml

This file was deleted.

191 changes: 159 additions & 32 deletions Jenkinsfile
@@ -68,7 +68,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/TransformerEngine.git && \
cd TransformerEngine && \
- git fetch origin da30634a6c9ccdbb6c587b6c93b1860e4b038204 && \
+ git fetch origin 8c9abbb80dba196f086b8b602a7cf1bce0040a6a && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .'
@@ -91,7 +91,7 @@ pipeline {
steps {
sh 'git clone https://github.com/NVIDIA/Megatron-LM.git && \
cd Megatron-LM && \
- git checkout 98da3792f53c80ac9e865eab49a6fa5ccc293d22 && \
+ git checkout 5f9c870f9f24b482509699d206a9dbb00958f6fc && \
pip install .'
}
}
@@ -115,6 +115,13 @@ pipeline {
sh 'python -c "import nemo.collections.tts as nemo_tts"'
}
}
stage('Import Checks'){
steps {
sh 'python tests/core_ptl/check_imports.py --domain "asr"'
sh 'python tests/core_ptl/check_imports.py --domain "nlp"'
sh 'python tests/core_ptl/check_imports.py --domain "tts"'
}
}
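
These three sh steps replace the deleted .github/workflows/import-test.yml, moving the import smoke tests into Jenkins. Below is a minimal sketch of what a check_imports.py-style script could look like, assuming it simply maps --domain to a collection import and exits non-zero on failure; the real tests/core_ptl/check_imports.py may be organized differently.

    # Hypothetical sketch of an import-check script in the spirit of
    # tests/core_ptl/check_imports.py. Assumption: --domain selects a NeMo
    # collection, and any import-time error fails the CI step.
    import argparse
    import importlib
    import sys

    DOMAINS = {
        "asr": "nemo.collections.asr",
        "nlp": "nemo.collections.nlp",
        "tts": "nemo.collections.tts",
    }

    def main() -> int:
        parser = argparse.ArgumentParser(description="Smoke-test a NeMo collection import.")
        parser.add_argument("--domain", choices=sorted(DOMAINS), required=True)
        args = parser.parse_args()
        module = DOMAINS[args.domain]
        try:
            importlib.import_module(module)
        except Exception as exc:  # surface any import-time failure to CI
            print(f"Import of {module} failed: {exc}", file=sys.stderr)
            return 1
        print(f"Import of {module} succeeded.")
        return 0

    if __name__ == "__main__":
        sys.exit(main())
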
stage('L0: Unit Tests GPU') {
steps {
sh 'NEMO_NUMBA_MINVER=0.53 pytest -m "not pleasefixme" --with_downloads'
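
Here, pytest -m "not pleasefixme" deselects tests marked as known-broken, and --with_downloads opts in to tests that fetch external data. The following hypothetical conftest.py sketch shows one standard way such a flag and marker pair can be wired; NeMo's actual test plumbing may differ.

    # Hypothetical conftest.py sketch for a --with_downloads style flag.
    # Assumption: download-dependent tests are marked @pytest.mark.with_downloads
    # and skipped unless the flag is passed.
    import pytest

    def pytest_addoption(parser):
        parser.addoption("--with_downloads", action="store_true", default=False,
                         help="run tests that download external data")

    def pytest_configure(config):
        config.addinivalue_line("markers", "with_downloads: test downloads external data")

    def pytest_collection_modifyitems(config, items):
        if config.getoption("--with_downloads"):
            return  # flag given: run everything
        skip = pytest.mark.skip(reason="needs --with_downloads")
        for item in items:
            if "with_downloads" in item.keywords:
                item.add_marker(skip)
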
@@ -3478,6 +3485,90 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf examples/nlp/language_modeling/token_classification_results"
}
}
stage('L2: Megatron GPT Pretraining and Resume Training TETransformerLayerTP=2') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.name=megatron_gpt_full_te_layer_autocast \
model.mcore_gpt=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.normalization=layernorm1p \
model.bias_activation_fusion=True \
model.bias_dropout_add_fusion=True \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=null \
model.activations_checkpoint_granularity=null \
model.activations_checkpoint_num_layers=null \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.mcore_gpt=True \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
model.optim.sched.min_lr=8e-5 \
model.max_position_embeddings=128 \
model.encoder_seq_length=128 \
model.data.seq_length=128 \
model.normalization=layernorm1p \
model.bias_activation_fusion=True \
model.bias_dropout_add_fusion=True \
model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \
model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \
model.num_layers=8 \
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=null \
model.activations_checkpoint_granularity=null \
model.activations_checkpoint_num_layers=null \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
sh "rm -rf examples/nlp/language_modeling/gpt_index_mappings"
}
}
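
Every model.* and trainer.* token in the commands above is a Hydra-style dotlist override applied on top of the script's base YAML config; the ++ prefix force-adds a key the base config may not declare, and null clears a value. A minimal OmegaConf sketch of the plain-override mechanics follows, using invented config values rather than NeMo's real defaults (Hydra, not raw OmegaConf, handles the ++ syntax).

    from omegaconf import OmegaConf

    # Invented base config standing in for a NeMo YAML; real defaults differ.
    base = OmegaConf.create({
        "trainer": {"devices": 1, "max_steps": 100, "precision": 32},
        "model": {
            "tensor_model_parallel_size": 1,
            "activations_checkpoint_method": "block",
            "optim": {"name": "adam", "lr": 0.0001},
        },
    })

    # Dotlist overrides, in the style passed on the command lines above.
    overrides = OmegaConf.from_dotlist([
        "trainer.devices=2",
        "trainer.max_steps=3",
        "model.tensor_model_parallel_size=2",
        "model.optim.name=fused_adam",
        "model.optim.lr=0.0002",
        "model.activations_checkpoint_method=null",  # YAML null -> None
    ])

    cfg = OmegaConf.merge(base, overrides)  # later configs win on conflicts
    assert cfg.trainer.devices == 2
    assert cfg.model.activations_checkpoint_method is None
    print(OmegaConf.to_yaml(cfg))

The second sh command in the stage reruns the same script with trainer.max_steps=6 and exp_manager.resume_if_exists=True, so the job resumes from the step-3 checkpoint written by the first run rather than starting over.
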
// @chcui: model.cpu_offloading_num_layers=7 # temp workaround before m-lm !1124 is merged
stage('L2: Megatron GPT Pretraining and Resume Training TP=2') {
when {
@@ -3493,7 +3584,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
- trainer.limit_val_batches=2 \
+ trainer.limit_val_batches=1.0 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=3 \
trainer.precision=16 \
@@ -3528,7 +3619,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
- trainer.limit_val_batches=2 \
+ trainer.limit_val_batches=1.0 \
trainer.accumulate_grad_batches=1 \
trainer.max_steps=6 \
trainer.precision=16 \
@@ -3607,7 +3698,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
model.activations_checkpoint_method='block' \
model.activations_checkpoint_granularity='full' \
model.activations_checkpoint_num_layers=1 \
- model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
+ model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],validation:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],test:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]}' \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
// commented out to save time on github ci @adithyare
//sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
@@ -4200,6 +4291,42 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf /home/TestData/nlp/lora_tuning_tp2"
}
}
stage('L2: Megatron GPT PEFT Lora TP=2 SP') {
when {
anyOf {
branch 'main'
changeRequest target: 'main'
}
}
failFast true
steps {
sh "rm -rf /home/TestData/nlp/lora_tuning_tp2_sp"
sh "python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
trainer.max_steps=3 \
trainer.val_check_interval=3 \
++trainer.limit_val_batches=2 \
trainer.precision=16 \
exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2_sp \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.sequence_parallel=true \
model.restore_from_path=/home/TestData/nlp/megatron_gpt/TP2/megatron_gpt_tp2.nemo \
model.peft.peft_scheme='lora' \
model.answer_only_loss=True \
model.micro_batch_size=1 \
model.global_batch_size=1 \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.train_ds.concat_sampling_probabilities=[1.0] \
model.data.train_ds.num_workers=0 \
model.data.validation_ds.num_workers=0 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]"
sh "rm -rf /home/TestData/nlp/lora_tuning_tp2_sp"
}
}
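
Both LoRA stages fine-tune a frozen .nemo checkpoint with model.peft.peft_scheme='lora', here additionally under tensor parallelism (and sequence parallelism in the second stage). As background on the technique itself, here is a textbook sketch of a low-rank-adapted linear layer; it is illustrative only and not NeMo's adapter implementation.

    # Minimal LoRA linear layer: a frozen base weight plus a trainable
    # low-rank update B @ A, scaled by alpha / r.
    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        def __init__(self, in_features: int, out_features: int, r: int = 8, alpha: int = 16):
            super().__init__()
            self.base = nn.Linear(in_features, out_features, bias=False)
            self.base.weight.requires_grad_(False)  # base weights stay frozen
            self.lora_a = nn.Parameter(torch.randn(r, in_features) * 0.01)
            self.lora_b = nn.Parameter(torch.zeros(out_features, r))  # zero-init: no-op at start
            self.scaling = alpha / r

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.base(x) + (x @ self.lora_a.T @ self.lora_b.T) * self.scaling

    layer = LoRALinear(256, 256)
    trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
    print(f"trainable params: {trainable}")  # only the low-rank factors
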
stage('L2: Megatron GPT Eval') {
when {
anyOf {
@@ -5116,34 +5243,34 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
}
}
failFast true
- //parallel {
- //stage('MockGPTDataset') {
- // steps {
- // sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
- // trainer.max_steps=10 \
- // trainer.limit_val_batches=7 \
- // trainer.val_check_interval=10 \
- // exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
- // model.data.data_impl=mock \
- // model.data.data_prefix=[] \
- // "
- // sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
- // }
- //}
- //stage('MockT5Dataset') {
- steps {
- sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \
- trainer.max_steps=10 \
- trainer.limit_val_batches=3 \
- trainer.val_check_interval=10 \
- exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
- model.data.data_impl=mock \
- model.data.data_prefix=[] \
- "
- sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results"
+ parallel {
+ stage('MockGPTDataset') {
+ steps {
+ sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+ trainer.max_steps=10 \
+ trainer.limit_val_batches=7 \
+ trainer.val_check_interval=10 \
+ exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+ model.data.data_impl=mock \
+ model.data.data_prefix=[] \
+ "
+ sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
+ }
+ }
+ stage('MockT5Dataset') {
+ steps {
+ sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+ trainer.max_steps=10 \
+ trainer.limit_val_batches=3 \
+ trainer.val_check_interval=10 \
+ exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+ model.data.data_impl=mock \
+ model.data.data_prefix=[] \
+ "
+ sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results"
+ }
+ }
+ }
- //}
- //}
}
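
The re-enabled parallel stages set model.data.data_impl=mock with an empty data_prefix, so pretraining runs on synthetic batches instead of files on disk. A hypothetical sketch of the core idea follows (NeMo's actual mock GPT dataset carries more fields, such as attention masks and position ids).

    # Sketch of what a data_impl=mock dataset amounts to: random token ids of
    # the right shape, so the training loop can run with no data on disk.
    import torch
    from torch.utils.data import Dataset

    class MockGPTDataset(Dataset):
        def __init__(self, num_samples: int, seq_length: int, vocab_size: int):
            self.num_samples = num_samples
            self.seq_length = seq_length
            self.vocab_size = vocab_size

        def __len__(self) -> int:
            return self.num_samples

        def __getitem__(self, idx: int) -> dict:
            g = torch.Generator().manual_seed(idx)  # deterministic per index
            tokens = torch.randint(0, self.vocab_size, (self.seq_length + 1,), generator=g)
            return {"tokens": tokens[:-1], "labels": tokens[1:]}

    ds = MockGPTDataset(num_samples=128, seq_length=64, vocab_size=1000)
    print(ds[0]["tokens"].shape)  # torch.Size([64])
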

stage('L2: TTS Fast dev runs 1') {
(remaining diff not shown)