Add llama 3.1 recipes (#11273)
* add llama 3.1 recipes

Signed-off-by: Chen Cui <[email protected]>

* Apply isort and black reformatting

Signed-off-by: cuichenx <[email protected]>

* fix pylint

Signed-off-by: Chen Cui <[email protected]>

* Fix llama3.1 wrong config in io.json

---------

Signed-off-by: Chen Cui <[email protected]>
Signed-off-by: cuichenx <[email protected]>
Co-authored-by: cuichenx <[email protected]>
Co-authored-by: Ao Tang <[email protected]>
3 people authored Nov 13, 2024
1 parent a9a959c commit 3625d78
Showing 9 changed files with 938 additions and 18 deletions.
7 changes: 6 additions & 1 deletion nemo/collections/llm/gpt/model/llama.py
@@ -273,7 +273,12 @@ def make_vocab_size_divisible_by(vocab_size):
base //= 2
return base

output = LlamaConfig(
if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3':
# Apply Llama3.1 custom rope scaling
cls = Llama31Config
else:
cls = LlamaConfig
output = cls(
num_layers=source.num_hidden_layers,
hidden_size=source.hidden_size,
ffn_hidden_size=source.intermediate_size,
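
For context, here is a minimal sketch (not part of this commit) of what the new branch above does: Llama 3.1 checkpoints exported to HuggingFace carry a rope_scaling entry whose rope_type is 'llama3', while plain Llama 3 checkpoints do not, so the importer selects Llama31Config only in the former case. HFSource below is a hypothetical stand-in for the HuggingFace config object passed in as `source`.

    class HFSource:
        # Hypothetical stand-in for a HuggingFace LlamaConfig-like object.
        def __init__(self, rope_scaling=None):
            self.rope_scaling = rope_scaling

    def pick_config_cls(source):
        # Same test as in the diff above: only Llama 3.1 configs advertise rope_type 'llama3'.
        if getattr(source, 'rope_scaling', None) is not None and source.rope_scaling.get('rope_type') == 'llama3':
            return 'Llama31Config'
        return 'LlamaConfig'

    assert pick_config_cls(HFSource({'rope_type': 'llama3'})) == 'Llama31Config'
    assert pick_config_cls(HFSource()) == 'LlamaConfig'
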
4 changes: 4 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -30,6 +30,8 @@
llama3_70b,
llama3_70b_16k,
llama3_70b_64k,
llama31_8b,
llama31_70b,
llama31_405b,
mamba2_1_3b,
mamba2_2_7b,
@@ -82,6 +84,8 @@
"llama3_70b",
"llama3_70b_16k",
"llama3_70b_64k",
"llama31_8b",
"llama31_70b",
"llama31_405b",
"mamba2_130m",
"mamba2_370m",
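
With these registrations, the Llama 3.1 recipe modules become importable alongside the existing Llama 3 ones. A hedged usage sketch (it assumes llama31_8b exposes the same finetune_recipe entry point as llama31_405b further down in this commit; the argument values are illustrative only):

    from nemo.collections.llm import recipes

    # Assumed to mirror the finetune_recipe signature added in llama31_405b.py below.
    recipe = recipes.llama31_8b.finetune_recipe(
        name="llama31_8b_lora",
        num_nodes=1,
        num_gpus_per_node=8,
        peft_scheme="lora",
    )
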
19 changes: 17 additions & 2 deletions nemo/collections/llm/recipes/finetune_default.py
@@ -16,6 +16,7 @@

import nemo_run as run
import pytorch_lightning as pl
import torch

import nemo.lightning as nl
from nemo.collections import llm
@@ -82,7 +83,7 @@ def default_finetune_recipe(
def default_finetune_trainer(
tensor_parallelism=1,
pipeline_parallelism=1,
pipeline_parallelism_type=None,
pipeline_parallelism_type=torch.bfloat16,
virtual_pipeline_parallelism=None,
context_parallelism=1,
sequence_parallelism=False,
@@ -93,6 +94,19 @@ def default_finetune_trainer(
limit_val_batches=None,
val_check_interval=30,
):
"""
Create a default fine-tuning trainer for any model.
This function sets up a template for strategy and trainer.
Args:
See docstrings of MegatronStrategy and Trainer.
Returns:
run.Config: Config for a finetuning trainer.
See usages of this in recipes for further details.
"""
strategy = run.Config(
nl.MegatronStrategy,
tensor_model_parallel_size=tensor_parallelism,
@@ -125,7 +139,8 @@ def default_finetune_trainer(

def nemo_resume(model_id: str) -> run.Config[nl.AutoResume]:
"""
Configure automatic resumption from a NeMo checkpoint converted from Huggingface for https://huggingface.co/{model_id}.
Configure automatic resumption from a NeMo checkpoint converted from Huggingface for
https://huggingface.co/{model_id}.
This NeMo checkpoint should be converted from Huggingface beforehand, using nemo.collections.llm.import_ckpt.
When converting the checkpoint, the NeMo checkpoint will be saved in NEMO_HOME (set to ~/.cache/nemo by default).
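
As the nemo_resume docstring notes, the HuggingFace checkpoint has to be converted to NeMo format up front. A hedged sketch of that conversion step, using the 405B model touched by this commit (the exact import_ckpt keyword arguments and the "hf://" source prefix are assumptions about the NeMo 2.0 API, not taken from this diff):

    from nemo.collections import llm
    from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel

    # Assumed call shape; writes the converted checkpoint under NEMO_HOME (~/.cache/nemo by default).
    llm.import_ckpt(
        model=LlamaModel(Llama31Config405B()),
        source="hf://meta-llama/Llama-3.1-405B",
    )
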
118 changes: 110 additions & 8 deletions nemo/collections/llm/recipes/llama31_405b.py
@@ -24,6 +24,7 @@
from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.mock import MockDataModule
from nemo.collections.llm.gpt.data.packed_sequence import PackedSequenceSpecs
from nemo.collections.llm.gpt.model.llama import Llama31Config405B, LlamaModel
from nemo.collections.llm.peft.lora import LoRA
from nemo.collections.llm.recipes.finetune_default import default_finetune_recipe
@@ -33,6 +34,7 @@
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
userbuffers_bf16_h100_h16384_tp8_cp2_mbs1_seqlen8192,
)
from nemo.lightning.pytorch.callbacks import GarbageCollectionCallback
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
from nemo.utils.exp_manager import TimingCallback

@@ -248,6 +250,9 @@ def finetune_recipe(
num_nodes: int = 3,
num_gpus_per_node: int = 8,
peft_scheme: Optional[str] = 'lora',
seq_length: Optional[int] = None,
packed_sequence: Optional[bool] = None,
performance_mode: bool = False,
) -> run.Partial:
"""
Create a fine-tuning recipe for Llama3.1 405B model.
@@ -261,8 +266,11 @@
name (str): Name of the fine-tuning run.
num_nodes (int): Number of compute nodes to use.
num_gpus_per_node (int): Number of GPUs per node.
peft_scheme (Optional[str]): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
peft_scheme (Optional[str]): Name of the peft scheme to use for finetuning. Allowed values: 'lora'/'none'/None.
seq_length (int): Maximum number of tokens per microbatch.
packed_sequence (Optional[bool]): If true, fine-tuning sequences will be packed into batches up to the given
maximum seq_length for better efficiency. By default, this value equals performance_mode.
performance_mode (bool): If true, enables optimizations for maximum performance.
Returns:
run.Partial: Partial configuration for fine-tuning.
@@ -279,22 +287,116 @@
This recipe uses the SQuAD dataset for fine-tuning. Be aware that fine-tuning a 405B model
requires substantial computational resources.
"""
if packed_sequence is None:
packed_sequence = performance_mode

if seq_length is None:
seq_length = 2048

if num_nodes is None:
if peft_scheme is None or peft_scheme.lower() == 'none':
num_nodes = 12
elif peft_scheme.lower() == 'lora':
num_nodes = 3

recipe = default_finetune_recipe(
model(), "meta-llama/Meta-Llama-3.1-405B", dir, name, num_nodes, num_gpus_per_node
model(), "meta-llama/Llama-3.1-405B", dir, name, num_nodes, num_gpus_per_node, packed_sequence
)

if peft_scheme is None or peft_scheme.lower() == 'none':
assert num_nodes >= 4
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.trainer.strategy.pipeline_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 14
recipe.data.global_batch_size = 6
recipe.optim.config.lr = 5e-6
elif peft_scheme.lower() == 'lora':
recipe.peft = run.Config(LoRA)
recipe.peft.dim = 16
recipe.peft.alpha = 32
recipe.peft.target_modules = ['linear_qkv']
recipe.optim.config.use_distributed_optimizer = False

# some settings currently do not function correctly with LoRA
recipe.model.config.cross_entropy_loss_fusion = False
recipe.trainer.strategy.tensor_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 6
recipe.trainer.strategy.virtual_pipeline_parallelism = 7
recipe.data.global_batch_size = 128
recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7
recipe.data.global_batch_size = 6
recipe.optim.config.lr = 1e-4
else:
raise ValueError(f"Unrecognized peft scheme: {peft_scheme}")

# Sequence length settings in the model and dataset must agree
recipe.model.config.seq_length = seq_length
recipe.data.seq_length = seq_length
if packed_sequence:
recipe.data.dataset_kwargs = {'pad_to_max_length': True}
recipe.data.packed_sequence_specs = run.Config(PackedSequenceSpecs, packed_sequence_size=seq_length)

if performance_mode:
recipe = finetune_performance_optimizations(recipe, peft_scheme)

return recipe


def finetune_performance_optimizations(
recipe: run.Partial,
peft_scheme: str,
) -> run.Partial:
"""
Modify the given recipe to optimize settings for performance.
This method enables performance optimizations that may not be suitable for all use cases.
Intended to build upon the standard fine-tuning recipe.
Args:
recipe (run.Partial): Base fine-tuning recipe to which performance optimizations will be added
peft_scheme (str): Name of the peft scheme to use for fine-tuning. Allowed values: 'lora', 'none'/None.
Returns:
run.Partial: Partial configuration for performance-optimized fine-tuning.
Note:
Use this method with caution and only when you need maximum performance.
It may not be suitable for all hardware configurations or use cases.
"""

if not hasattr(recipe.trainer, "callbacks"):
recipe.trainer.callbacks = []

if peft_scheme is None or peft_scheme.lower() == 'none':
# Note: limited support. This is not necessarily the most optimized setting
recipe.trainer.strategy.tensor_model_parallel_size = 8
recipe.trainer.strategy.pipeline_model_parallel_size = 14
recipe.trainer.plugins.grad_reduce_in_fp32 = False
recipe.trainer.strategy.ddp = run.Config(
DistributedDataParallelConfig,
check_for_nan_in_grad=True,
grad_reduce_in_fp32=False,
overlap_grad_reduce=True,
overlap_param_gather=True,
average_in_collective=True,
)
recipe.trainer.callbacks.append(
run.Config(
MegatronCommOverlapCallback,
tp_comm_overlap=True,
defer_embedding_wgrad_compute=True,
wgrad_deferral_limit=22,
)
)
else:
recipe.trainer.strategy.tensor_model_parallel_size = 4
recipe.trainer.strategy.pipeline_model_parallel_size = 6
recipe.trainer.strategy.virtual_pipeline_model_parallel_size = 7

recipe.trainer.strategy.sequence_parallel = True

recipe.trainer.callbacks.append(run.Config(TimingCallback))
recipe.trainer.callbacks.append(
run.Config(
GarbageCollectionCallback,
100,
100,
)
)

return recipe
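
Putting the two functions together, a brief usage example (values illustrative): performance_mode=True makes packed_sequence default to True and routes the recipe through finetune_performance_optimizations above.

    from nemo.collections.llm.recipes import llama31_405b

    # LoRA fine-tuning of Llama 3.1 405B with the performance optimizations enabled.
    # seq_length falls back to 2048 and packed_sequence to performance_mode when left as None.
    recipe = llama31_405b.finetune_recipe(
        name="llama31_405b_lora_perf",
        num_nodes=3,
        num_gpus_per_node=8,
        peft_scheme="lora",
        performance_mode=True,
    )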
