Add FusedRMSNorm (Triton kernel, +15% eager), Add NPLayerNorm, Enable config selectable Norm Type #181

Merged Apr 5, 2024 · 31 commits · Changes from 28 commits

Commits
457abee
Create fused_rms_norm.py
lessw2020 Mar 29, 2024
a2435df
start unifying all norms with single base class
lessw2020 Mar 30, 2024
de2a41b
add strEnum for NormTypes
lessw2020 Mar 30, 2024
e8782f1
link norm type control to model config from train config
lessw2020 Mar 30, 2024
d797d27
link norm type control to model config from train config
lessw2020 Mar 30, 2024
81f3402
all working
lessw2020 Mar 30, 2024
a1b45ba
linting
lessw2020 Mar 30, 2024
0c0d41f
linting, remove triton file from linting check
lessw2020 Mar 30, 2024
47a5791
add default rms in config_manager
lessw2020 Mar 30, 2024
be88137
lint default rms in config_manager, update 7b, 70b toml
lessw2020 Mar 30, 2024
ff37024
remove alternative backward pass section (saved in kernel repo)
lessw2020 Mar 31, 2024
7c0c62b
revert reshard_after_forward from MFU tuning
lessw2020 Apr 1, 2024
f81ff3f
merge all norms into single file per PR feedback
lessw2020 Apr 2, 2024
87def6a
move norms.py into models dir
lessw2020 Apr 2, 2024
df29e6e
remove StrEnum, add localhost:0
lessw2020 Apr 4, 2024
aef4e0c
change reshape to view in triton code
lessw2020 Apr 5, 2024
9088524
standardize norms to 4 express names
lessw2020 Apr 5, 2024
d32fe71
remove normbase
lessw2020 Apr 5, 2024
da4bd20
correct merge conflict in run.sh
lessw2020 Apr 5, 2024
310be84
update to remove debug2d.toml
lessw2020 Apr 5, 2024
6747119
correct mergeconflict 2 in run.sh
lessw2020 Apr 5, 2024
2d46786
Merge branch 'main' into fused_rms_norm
lessw2020 Apr 5, 2024
83ec4b6
linting
lessw2020 Apr 5, 2024
8750043
add comment regarding flake8 N803,N806
lessw2020 Apr 5, 2024
701e0d3
add check for fused_rmsnorm when TP active.
lessw2020 Apr 5, 2024
2a3ac78
linting
lessw2020 Apr 5, 2024
d5b4b32
formatting
lessw2020 Apr 5, 2024
5a937e9
remove init_weights, streamline reset_params
lessw2020 Apr 5, 2024
54cf135
updated TP norm msg, reset 7b steps
lessw2020 Apr 5, 2024
239b71f
update kernel name to TritonFusedRMSNorm
lessw2020 Apr 5, 2024
1d84539
Merge branch 'main' into fused_rms_norm
lessw2020 Apr 5, 2024
3 changes: 2 additions & 1 deletion .flake8
@@ -7,8 +7,9 @@ max-line-length = 120
# N812 ignored because import torch.nn.functional as F is PyTorch convention
# N817 ignored because importing using acronyms is convention (DistributedDataParallel as DDP)
# E731 allow usage of assigning lambda expressions
# N803,N806 allow caps and mixed case in function params. This is to work with Triton kernel coding style.
ignore =
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731
E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,N812,N817,E731,N803,N806
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit
EXE001,
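The N803/N806 exemptions are needed because Triton kernels conventionally use uppercase names for tensor pointers (X, W, Y) and compile-time constants (BLOCK_SIZE), which flake8's pep8-naming checks would otherwise flag. A minimal sketch of an RMSNorm forward kernel in that style, for illustration only — this is not the fused_rms_norm.py kernel added in this PR, and the kernel and wrapper names here are hypothetical:

import torch
import triton
import triton.language as tl


@triton.jit
def _rms_norm_fwd(X, W, Y, stride, N, eps, BLOCK_SIZE: tl.constexpr):
    # One program per row: load the row, compute rms = sqrt(mean(x^2) + eps),
    # normalize, scale by the weight vector, and write the result back.
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE)
    mask = cols < N
    x = tl.load(X + row * stride + cols, mask=mask, other=0.0).to(tl.float32)
    w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
    rms = tl.sqrt(tl.sum(x * x, axis=0) / N + eps)
    y = x / rms * w
    tl.store(Y + row * stride + cols, y, mask=mask)


def rms_norm_fwd(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # x: (n_rows, dim) CUDA tensor; weight: (dim,) learnable scale.
    y = torch.empty_like(x)
    n_rows, n_cols = x.shape
    BLOCK_SIZE = triton.next_power_of_2(n_cols)
    _rms_norm_fwd[(n_rows,)](x, weight, y, x.stride(0), n_cols, eps, BLOCK_SIZE=BLOCK_SIZE)
    return y

The uppercase kernel arguments would trigger N803 and the BLOCK_SIZE local would trigger N806, which is exactly what the new ignore entries allow.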
2 changes: 1 addition & 1 deletion run_llama_train.sh
@@ -24,6 +24,6 @@ if [ $# -ne 0 ]; then
overrides="$*"
fi

torchrun --nproc_per_node=${NGPU} --rdzv_endpoint="localhost:5972" \
torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
train.py --job.config_file ${CONFIG_FILE} $overrides
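The endpoint change (fixed port 5972 → localhost:0 with the c10d rendezvous backend) presumably lets torchrun bind to an OS-assigned free port, avoiding collisions when multiple training jobs share a host; see the "remove StrEnum, add localhost:0" commit above.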
6 changes: 6 additions & 0 deletions torchtrain/config_manager.py
@@ -117,6 +117,12 @@ def __init__(self):
default="debugmodel",
help="which model config to train",
)
self.parser.add_argument(
"--model.norm_type",
type=str,
default="rmsnorm",
help="Layer Normalization type to use [layernorm, np_layernorm, rmsnorm, fused_rmsnorm]",
)
self.parser.add_argument(
"--model.tokenizer_path",
type=str,
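With this argument in place, the norm implementation becomes a per-run choice rather than a code edit: an invocation such as ./run_llama_train.sh --model.norm_type fused_rmsnorm would flow through the $overrides variable in run_llama_train.sh into train.py (the exact override spelling here is illustrative; per the commit list, the 7B/70B .toml training configs were also updated alongside this flag).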
67 changes: 13 additions & 54 deletions torchtrain/models/llama/model.py
@@ -7,6 +7,7 @@
import torch
import torch.nn.functional as F
from torch import nn
from torchtrain.models.norms import create_norm


@dataclass
@@ -25,57 +26,7 @@ class ModelArgs:
depth_init: bool = (
True # initialization uses each unique layer_id or total model layer count
)


class RMSNorm(torch.nn.Module):
"""
Initialize the RMSNorm normalization layer.

Args:
dim (int): The dimension of the input tensor.
eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

Attributes:
eps (float): A small value added to the denominator for numerical stability.
weight (nn.Parameter): Learnable scaling parameter.

"""

def __init__(self, dim: int, eps: float = 1e-6):
super().__init__()
self.eps = eps
self.weight = nn.Parameter(torch.empty(dim))
self.reset_parameters()

def _norm(self, x: torch.Tensor):
"""
Apply the RMSNorm normalization to the input tensor.

Args:
x (torch.Tensor): The input tensor.

Returns:
torch.Tensor: The normalized tensor.

"""
return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

def forward(self, x: torch.Tensor):
"""
Forward pass through the RMSNorm layer.

Args:
x (torch.Tensor): The input tensor.

Returns:
torch.Tensor: The output tensor after applying RMSNorm.

"""
output = self._norm(x.float()).type_as(x)
return output * self.weight

def reset_parameters(self):
torch.nn.init.ones_(self.weight)
norm_type: str = "rmsnorm"


def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
@@ -375,8 +326,13 @@ def __init__(self, layer_id: int, model_args: ModelArgs):
)
self.layer_id = layer_id
self.num_layers = model_args.n_layers
self.attention_norm = RMSNorm(model_args.dim, eps=model_args.norm_eps)
self.ffn_norm = RMSNorm(model_args.dim, eps=model_args.norm_eps)

self.attention_norm = create_norm(
model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps
)
self.ffn_norm = create_norm(
model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps
)

if model_args.depth_init:
self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
@@ -441,7 +397,10 @@ def __init__(self, model_args: ModelArgs):
for layer_id in range(model_args.n_layers):
self.layers.append(TransformerBlock(layer_id, model_args))

self.norm = RMSNorm(model_args.dim, eps=model_args.norm_eps)
self.norm = create_norm(
model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps
)

self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
self.init_weights()

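The three call sites above all go through create_norm, but torchtrain/models/norms.py itself is outside this excerpt. A minimal sketch of what the factory's interface implies, assuming "np_layernorm" means a non-parametric LayerNorm (no learnable affine parameters) and with the Triton-backed branch stubbed out:

import torch
from torch import nn


class RMSNorm(nn.Module):
    # Same math as the RMSNorm class removed from model.py above.
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        normed = x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
        return normed.type_as(x) * self.weight

    def reset_parameters(self):
        nn.init.ones_(self.weight)


def create_norm(norm_type: str, dim: int, eps: float = 1e-6) -> nn.Module:
    # Dispatch on the four names listed for --model.norm_type in config_manager.py.
    norm_type = norm_type.lower()
    if norm_type == "layernorm":
        return nn.LayerNorm(dim, eps=eps)
    if norm_type == "np_layernorm":
        # Assumed meaning: non-parametric LayerNorm, i.e. no learnable affine parameters.
        return nn.LayerNorm(dim, eps=eps, elementwise_affine=False)
    if norm_type == "rmsnorm":
        return RMSNorm(dim, eps=eps)
    if norm_type == "fused_rmsnorm":
        # In the PR this branch returns TritonFusedRMSNorm (backed by fused_rms_norm.py);
        # stubbed out here because that kernel is not part of this excerpt.
        raise NotImplementedError("fused_rmsnorm needs the Triton kernel added in this PR")
    raise NotImplementedError(f"Unknown norm_type: '{norm_type}'")

A call site then mirrors the ones in model.py above, e.g. create_norm(model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps).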