release updates (#8394)
* release updates (#8378)

* [tutorial] fixed missing RIR scripts file. (#8257)

Signed-off-by: Xuesong Yang <[email protected]>

* add values to en tts dict (#7879)

Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>

* mcore ds fix

Signed-off-by: Dmytro Pykhtar <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update mcore

Signed-off-by: dimapihtar <[email protected]>

* revert asr files

Signed-off-by: dimapihtar <[email protected]>

* add comments

Signed-off-by: dimapihtar <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add support for mcore mock dataset

Signed-off-by: dimapihtar <[email protected]>

* update mcore version

Signed-off-by: dimapihtar <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update gpt cfg

Signed-off-by: dimapihtar <[email protected]>

* update mcore commit

Signed-off-by: dimapihtar <[email protected]>

* fix Bert unit tests

Signed-off-by: dimapihtar <[email protected]>

* update bert tests

Signed-off-by: dimapihtar <[email protected]>

* fix bert mcore test

Signed-off-by: dimapihtar <[email protected]>

* fix gpt jenkins tests

Signed-off-by: dimapihtar <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add support for dict data input type

Signed-off-by: dimapihtar <[email protected]>

* add mock ds test

Signed-off-by: dimapihtar <[email protected]>

* add test for dict data input type

Signed-off-by: dimapihtar <[email protected]>

* mcore ds fix

Signed-off-by: dimapihtar <[email protected]>

* data input fix

Signed-off-by: dimapihtar <[email protected]>

---------

Signed-off-by: Xuesong Yang <[email protected]>
Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: Xuesong Yang <[email protected]>
Co-authored-by: Mariana <[email protected]>
Co-authored-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Pablo Garay <[email protected]>

* Update megatron_gpt_model.py

Signed-off-by: Dmytro Pykhtar <[email protected]>

---------

Signed-off-by: Xuesong Yang <[email protected]>
Signed-off-by: Mariana Graterol Fuenmayor <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Signed-off-by: dimapihtar <[email protected]>
Signed-off-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: Xuesong Yang <[email protected]>
Co-authored-by: Mariana <[email protected]>
Co-authored-by: Dmytro Pykhtar <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Pablo Garay <[email protected]>
Co-authored-by: Eric Harper <[email protected]>
Signed-off-by: Alexandros Koumparoulis <[email protected]>
8 people authored and akoumpa committed Feb 26, 2024
1 parent 5d3ab6a commit 03abfa6
Showing 2 changed files with 31 additions and 30 deletions.
Jenkinsfile — 28 additions & 28 deletions
@@ -3698,7 +3698,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
         model.activations_checkpoint_method='block' \
         model.activations_checkpoint_granularity='full' \
         model.activations_checkpoint_num_layers=1 \
-        model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
+        model.data.data_prefix='{train:[1.0,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],validation:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document],test:[/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document]}' \
         model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings"
         // commented out to save time on github ci @adithyare
         //sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
@@ -5243,34 +5243,34 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
         }
       }
       failFast true
-      //parallel {
-      //stage('MockGPTDataset') {
-      //  steps {
-      //    sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
-      //    trainer.max_steps=10 \
-      //    trainer.limit_val_batches=7 \
-      //    trainer.val_check_interval=10 \
-      //    exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
-      //    model.data.data_impl=mock \
-      //    model.data.data_prefix=[] \
-      //    "
-      //    sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
-      //  }
-      //}
-      //stage('MockT5Dataset') {
-      steps {
-        sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \
-        trainer.max_steps=10 \
-        trainer.limit_val_batches=3 \
-        trainer.val_check_interval=10 \
-        exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
-        model.data.data_impl=mock \
-        model.data.data_prefix=[] \
-        "
-        sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results"
+      parallel {
+        stage('MockGPTDataset') {
+          steps {
+            sh "python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
+            trainer.max_steps=10 \
+            trainer.limit_val_batches=7 \
+            trainer.val_check_interval=10 \
+            exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
+            model.data.data_impl=mock \
+            model.data.data_prefix=[] \
+            "
+            sh "rm -rf examples/nlp/language_modeling/gpt_pretrain_results"
+          }
+        }
+        stage('MockT5Dataset') {
+          steps {
+            sh "python examples/nlp/language_modeling/megatron_t5_pretraining.py \
+            trainer.max_steps=10 \
+            trainer.limit_val_batches=3 \
+            trainer.val_check_interval=10 \
+            exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
+            model.data.data_impl=mock \
+            model.data.data_prefix=[] \
+            "
+            sh "rm -rf examples/nlp/language_modeling/t5_pretrain_results"
+          }
+        }
       }
-      //}
-      //}
     }

 stage('L2: TTS Fast dev runs 1') {
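
The Jenkinsfile change above switches the GPT pretraining CI job to the new dictionary form of model.data.data_prefix added in this commit. A minimal sketch of the two accepted shapes, assuming OmegaConf configs of the kind NeMo uses (the dataset paths and weights below are placeholders, not real data):

from omegaconf import OmegaConf, DictConfig

# Legacy form: one weighted blend; train/validation/test are later carved out
# of it according to model.data.splits_string.
blend_cfg = OmegaConf.create(
    {"data_prefix": [0.5, "/data/corpus_a_text_document", 0.5, "/data/corpus_b_text_document"]}
)

# New form (this commit): an explicit per-split mapping, so splits_string is not needed.
per_split_cfg = OmegaConf.create(
    {
        "data_prefix": {
            "train": [1.0, "/data/corpus_a_text_document"],
            "validation": ["/data/corpus_b_text_document"],
            "test": ["/data/corpus_b_text_document"],
        }
    }
)

# The model code branches on the config type (see the megatron_gpt_model.py diff below).
for cfg in (blend_cfg, per_split_cfg):
    form = "per-split dict" if isinstance(cfg.data_prefix, DictConfig) else "weighted blend"
    print(form, OmegaConf.to_container(cfg.data_prefix))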
megatron_gpt_model.py — 3 additions & 2 deletions
@@ -1204,12 +1204,11 @@ def build_train_valid_test_datasets(self):
         # Setting N = 1 we force E to be 1 as well
         train_valid_test_num_samples = [max_train_steps * global_batch_size, 1, 1]

-        mock_dataset = self.cfg.data.get("mock_dataset", False)
+        mock_dataset = True if self.cfg.data.get("data_impl", "mmap") == "mock" else False
         kwargs = {
             "is_built_on_rank": is_dataset_built_on_rank,
             "random_seed": self.cfg.seed,
             "sequence_length": self.cfg.data.seq_length,
-            "split": self.cfg.data.splits_string,
             "path_to_cache": self.cfg.data.index_mapping_dir,
             "tokenizer": self.tokenizer,
             "reset_position_ids": self.reset_position_ids,
@@ -1218,11 +1217,13 @@
             "mock": mock_dataset,
         }

+        # support for dict data input type
         if isinstance(self.cfg.data.data_prefix, DictConfig):
             _pref = self.cfg.data.data_prefix
             kwargs['blend_per_split'] = [_pref['train'], _pref['validation'], _pref['test']]
         else:
             kwargs['blend'] = self.cfg.data.data_prefix
+            kwargs["split"] = self.cfg.data.splits_string

         if self.cfg.data.get('add_fim', False):
             dataset_config = GPTFIMDatasetConfig(self.tokenizer, self.cfg.data.fim, **kwargs)
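
Condensed, the new selection logic in build_train_valid_test_datasets() behaves roughly as below. This is a sketch with simplified, hypothetical names (build_gpt_dataset_kwargs, data_cfg) rather than the actual model attributes, and the seq_length and splits_string values are illustrative; the example config mirrors the MockGPTDataset CI stage above:

from omegaconf import DictConfig, OmegaConf

def build_gpt_dataset_kwargs(data_cfg: DictConfig) -> dict:
    # Hypothetical standalone helper; in NeMo this logic lives inside
    # MegatronGPTModel.build_train_valid_test_datasets().
    kwargs = {
        # Mock mode is now derived from data_impl instead of a separate mock_dataset flag.
        "mock": data_cfg.get("data_impl", "mmap") == "mock",
        "sequence_length": data_cfg.seq_length,
        "path_to_cache": data_cfg.get("index_mapping_dir"),
    }
    if isinstance(data_cfg.data_prefix, DictConfig):
        # Dict input: one blend per split, so splits_string is not consulted.
        pref = data_cfg.data_prefix
        kwargs["blend_per_split"] = [pref["train"], pref["validation"], pref["test"]]
    else:
        # List/str input: a single blend, divided into splits by splits_string.
        kwargs["blend"] = data_cfg.data_prefix
        kwargs["split"] = data_cfg.splits_string
    return kwargs

# Example mirroring the MockGPTDataset stage: data_impl=mock, data_prefix=[].
mock_cfg = OmegaConf.create(
    {"data_impl": "mock", "data_prefix": [], "seq_length": 2048, "splits_string": "99,1,0"}
)
print(build_gpt_dataset_kwargs(mock_cfg))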
