Add GQA config to megatron gpt model (#7096)
* Add GQA config in gpt config file

Signed-off-by: jasonwan <[email protected]>

* Verify mcore is enabled when using GQA

Signed-off-by: jasonwan <[email protected]>

---------

Signed-off-by: jasonwan <[email protected]>
blahBlahhhJ authored and ericharper committed Jul 25, 2023
1 parent 2b6cbe7 commit 2320d50
Showing 2 changed files with 5 additions and 0 deletions.
@@ -87,6 +87,7 @@ model:
  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
  seq_len_interpolation_factor: null # RoPE Interpolation factor for sequence length. This is used to build long-context models with RoPE ex: https://arxiv.org/abs/2306.15595.
  num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used.

  tokenizer:
    library: 'megatron'
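
For context, `num_query_groups` sets how many key/value heads grouped-query attention (GQA) keeps: the query heads are split into that many groups and every query head in a group shares one KV head, so `num_query_groups == num_attention_heads` is standard multi-head attention and `num_query_groups == 1` is multi-query attention. The sketch below is a minimal, illustrative PyTorch implementation of that grouping, not NeMo/Megatron-LM code; all names and shapes are assumptions for the example.

# Minimal GQA sketch (illustrative only, not NeMo/Megatron-LM code).
import torch

def grouped_query_attention(x, wq, wk, wv, num_heads, num_query_groups):
    # x: [batch, seq, hidden]; wq: [hidden, hidden];
    # wk, wv: [hidden, num_query_groups * head_dim].
    b, s, h = x.shape
    head_dim = h // num_heads
    q = (x @ wq).view(b, s, num_heads, head_dim)
    k = (x @ wk).view(b, s, num_query_groups, head_dim)
    v = (x @ wv).view(b, s, num_query_groups, head_dim)
    # Each KV head is shared by num_heads // num_query_groups query heads.
    repeat = num_heads // num_query_groups
    k = k.repeat_interleave(repeat, dim=2)
    v = v.repeat_interleave(repeat, dim=2)
    q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # [b, heads, seq, head_dim]
    attn = torch.softmax(q @ k.transpose(-2, -1) / head_dim ** 0.5, dim=-1)
    return (attn @ v).transpose(1, 2).reshape(b, s, h)

# 8 query heads sharing 2 KV groups (4 query heads per group).
b, s, h, heads, groups = 2, 16, 64, 8, 2
head_dim = h // heads
x = torch.randn(b, s, h)
out = grouped_query_attention(
    x,
    torch.randn(h, h),
    torch.randn(h, groups * head_dim),
    torch.randn(h, groups * head_dim),
    heads,
    groups,
)  # -> [2, 16, 64]

Keeping fewer KV heads shrinks the inference-time KV cache by roughly num_heads / num_query_groups while leaving the query-side capacity unchanged.
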
@@ -320,6 +320,10 @@ def model_provider_func(self, pre_process, post_process):
                rotary_percent=self.cfg.get('rotary_percentage', 1.0),
            )
        else:
            assert (
                self.cfg.get('num_query_groups', None) is None
            ), "Group Query Attention is only supported in Megatron Core. Set 'mcore_gpt' to use GQA."

            model = GPTModel(
                config=self.model_parallel_config,
                vocab_size=self.cfg.get('override_vocab_size', self.padded_vocab_size),
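
The assertion above only takes effect on the non-mcore code path: when `mcore_gpt` is disabled, any non-null `num_query_groups` is rejected before the legacy `GPTModel` is built. A small stand-alone sketch of that guard (the `cfg` dicts are hypothetical, not actual NeMo configs):

# Stand-alone sketch of the guard added in this commit (cfg dicts are hypothetical).
def check_gqa_config(cfg):
    if not cfg.get('mcore_gpt', False):
        assert cfg.get('num_query_groups', None) is None, (
            "Group Query Attention is only supported in Megatron Core. "
            "Set 'mcore_gpt' to use GQA."
        )

check_gqa_config({'mcore_gpt': True, 'num_query_groups': 8})      # fine: mcore path supports GQA
check_gqa_config({'mcore_gpt': False, 'num_query_groups': None})  # fine: normal attention
# check_gqa_config({'mcore_gpt': False, 'num_query_groups': 8})   # AssertionError
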
