This repository has been archived by the owner on Aug 7, 2024. It is now read-only.

[FSDP2] precompute scale after optimizer.step for dynamic scaling #266

Closed. weifengpy wants to merge 32 commits into main from the fsdp2 branch.
Changes from 15 commits

Commits (32)
9d5595c
[FSDP2] set vocab_size=32 to avoid must be divisible by 16 error
weifengpy May 21, 2024
e7005c2
precast after optimizer.step and dump profiler traces
weifengpy May 21, 2024
e41d589
Merge branch 'main' into fsdp2
weifengpy May 21, 2024
e0bee10
precast and preamax unit test
weifengpy May 24, 2024
c0ba5a2
remove duplicate vocab
weifengpy May 24, 2024
8da238e
fused amax
weifengpy May 30, 2024
ffff5ed
Merge branch 'main' into fsdp2
weifengpy Jun 6, 2024
aefa21b
use FP8_TYPES and max
weifengpy Jun 6, 2024
d4a1db7
commit all changes before cleaning
weifengpy Jun 6, 2024
d36e79b
pre_compute and flatten / unflatten
weifengpy Jun 6, 2024
6f244a2
remove unused constant
weifengpy Jun 6, 2024
dc5eab0
torch.compile works
weifengpy Jun 6, 2024
546e979
eager ready
weifengpy Jun 6, 2024
229ede6
linter
weifengpy Jun 6, 2024
d5b3ff6
linter
weifengpy Jun 6, 2024
4f05e04
flatten tensor
weifengpy Jun 25, 2024
3de59af
commit all changes for review before rebasing
weifengpy Jul 8, 2024
ffcd197
rebase on unified float8linear
weifengpy Jul 9, 2024
6b18947
Merge branch 'pytorch-labs:main' into fsdp2
weifengpy Jul 9, 2024
562424c
move precompute to fsdp_utils.py
weifengpy Jul 9, 2024
75e0e45
simplify amax calc
weifengpy Jul 9, 2024
fe95f8b
explain _pre_computed_amax
weifengpy Jul 9, 2024
1cbaa13
fix linter
weifengpy Jul 9, 2024
fe2e0a0
document precompute_float8_amax_for_fsdp
weifengpy Jul 9, 2024
e4eaa2a
rename pre_compute to precompute
weifengpy Jul 9, 2024
e4245e4
Merge branch 'main' into fsdp2
weifengpy Jul 10, 2024
e12c973
remove clamp_amax=True/False
weifengpy Jul 10, 2024
9ef67fb
precompute scale
weifengpy Jul 10, 2024
fa2f08a
unit test for precomputing scales
weifengpy Jul 10, 2024
ba085e5
add precompute scale in README
weifengpy Jul 10, 2024
ac0afb0
rename to precompute_float8_dynamic_scale_for_fsdp
weifengpy Jul 11, 2024
8e56dfc
rename to precompute_float8_dynamic_scale_for_fsdp
weifengpy Jul 11, 2024
float8_experimental/float8_dynamic_linear.py: 20 changes (16 additions, 4 deletions)

@@ -22,7 +22,7 @@
tensor_already_casted_to_fp8,
to_fp8_no_autograd,
)
-from float8_experimental.float8_utils import tensor_to_scale
+from float8_experimental.float8_utils import amax_to_scale, tensor_to_scale
from torch._prims_common import suggest_memory_format


@@ -151,6 +151,7 @@ def __new__(cls, tensor: torch.Tensor, mm_config: ScaledMMConfig):
def __init__(self, tensor: torch.Tensor, mm_config: ScaledMMConfig):
self._tensor = tensor
self._mm_config = mm_config
self._pre_computed_amax = None
Contributor: does this need to be added to __tensor_flatten__? Can we add some comments on the intended usage of this?

Contributor (@drisspg, Jun 7, 2024): +1 on adding to flatten/unflatten and comments / intended usage.

Author (weifengpy): done
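A rough sketch of what registering _pre_computed_amax with the tensor-subclass flatten protocol could look like; the actual change lands in a later commit ("pre_compute and flatten / unflatten") that is not shown in this 15-commit view, so everything below except the attribute and class names is an assumption:

def __tensor_flatten__(self):
    # Only expose the extra inner tensor once it has been populated.
    if self._pre_computed_amax is None:
        return ["_tensor"], self._mm_config
    return ["_tensor", "_pre_computed_amax"], self._mm_config

@staticmethod
def __tensor_unflatten__(inner_tensors, flatten_spec, outer_size, outer_stride):
    mm_config = flatten_spec
    weight = WeightWithDynamicFloat8CastTensor(inner_tensors["_tensor"], mm_config)
    if "_pre_computed_amax" in inner_tensors:
        weight._pre_computed_amax = inner_tensors["_pre_computed_amax"]
    return weight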


@classmethod
def __torch_dispatch__(cls, func, types, args, kwargs=None):
Expand Down Expand Up @@ -190,9 +191,20 @@ def __repr__(self):
return f"WeightWithDynamicFloat8CastTensor(tensor={self._tensor}, mm_config={self._mm_config})"

def fsdp_pre_all_gather(self, mesh):
Author (weifengpy): if _pre_computed_amax is set, we skip tensor_to_amax and directly do amax_to_scale.

-        float8_tensor = cast_to_float8_e4m3fn(
-            self._tensor, self._mm_config, reduce_amax=True
-        )
+        if self._pre_computed_amax is not None:
+            scale = amax_to_scale(
+                self._pre_computed_amax,
+                torch.float8_e4m3fn,
+                self._pre_computed_amax.dtype,
+                clamp_amax=False,
+            )
+            float8_tensor = Float8Tensor.to_float8(
+                self._tensor, scale, torch.float8_e4m3fn, mm_config=self._mm_config
+            )
+        else:
+            float8_tensor = cast_to_float8_e4m3fn(
+                self._tensor, self._mm_config, reduce_amax=True
+            )
return (float8_tensor._data,), (float8_tensor._scale,)

def fsdp_post_all_gather(
float8_experimental/float8_linear_utils.py: 41 changes (39 additions, 2 deletions)

@@ -5,16 +5,22 @@
# LICENSE file in the root directory of this source tree.
import copy
import logging

import math
import warnings
from enum import auto, Enum
from typing import Callable, List, Optional, Type

import torch
import torch.distributed as dist
import torch.nn as nn
-from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
+from float8_experimental.float8_dynamic_linear import (
+    Float8DynamicLinear,
+    WeightWithDynamicFloat8CastTensor,
+)
from float8_experimental.float8_linear import Float8Linear

-from float8_experimental.float8_utils import amax_history_to_scale_stack
+from float8_experimental.float8_utils import amax_history_to_scale_stack, EPS
from torch.distributed._functional_collectives import all_reduce, AsyncCollectiveTensor

log = logging.getLogger(__name__)
@@ -322,3 +328,34 @@ def inner_func():
for child in fp8_layers:
# Set a flag to signal amaxes/scales are ready
child.amax_and_scale_synced = True


def precompute_float8_amax(module: nn.Module) -> None:
Contributor: can we put this in distributed_utils.py? I think the function name should indicate that this is intended for FSDP2 with float8 all-gather.

Author (weifengpy): moving to fsdp_utils.py according to PR #310

Author (weifengpy): indicating FSDP by renaming to precompute_float8_amax_for_fsdp

Contributor: @weifengpy do you plan / want to use compile on this, and are there any gaps around here that you think would be good to prioritize on the compile side? This is mostly just me remembering @awgu mention a while ago that he thought compile added noticeable runtime overhead, and I can't remember if it was for this specific case. If it is, and we think compiling this code would be useful, I can prioritize looking into the runtime overhead.

Author (weifengpy): Hi @bdhirsh, I plan to polish and land this PR without compile next week to conclude H1; most importantly, add _pre_computed_amax to flatten/unflatten. Reducing the runtime overhead from torch.compile is still meaningful, since we want torch.compile(fp8 casting) in FSDP2 pre-forward hooks. Would it be helpful if I work on a mini repro with profiler traces? I want to unblock you in the short term.

Contributor: If you have a mini repro showing bad runtime overheads with compile, that would be great!

Author (weifengpy): Hi @bdhirsh, I have created a repro: pytorch/pytorch#129457. I highlighted the extra CPU overhead and GPU time for torch.compile(mode="reduce-overhead").

from torch.distributed._tensor import DTensor

if any(isinstance(m, Float8Linear) for m in module.modules()):
raise NotImplementedError("Only supports Float8DynamicLinear, not Float8Linear")
float8_linears: List[Float8DynamicLinear] = [
m
for m in module.modules()
if isinstance(m, Float8DynamicLinear)
and isinstance(m.weight, DTensor)
and isinstance(m.weight._local_tensor, WeightWithDynamicFloat8CastTensor)
]
weights: List[DTensor] = [float8_linear.weight for float8_linear in float8_linears]

def compute_amaxes(weights: List[DTensor]):
max_weights = torch._foreach_norm(weights, ord=math.inf)
Contributor: maybe add a comment that this is equivalent to max(abs(w))?

Author (weifengpy): done
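An illustrative check of the equivalence the reviewer mentions (not part of the PR; the example tensors are made up):

import math
import torch

# torch._foreach_norm with ord=inf computes the infinity norm of each tensor,
# i.e. max(abs(w)), in one fused call instead of a separate abs().max() per weight.
ws = [torch.randn(4, 8), torch.randn(16, 16)]
fused = torch._foreach_norm(ws, ord=math.inf)
reference = [w.abs().max() for w in ws]
assert all(torch.equal(a, b) for a, b in zip(fused, reference))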

amax_tensor = torch.vstack(max_weights)
amax_tensor = torch.clamp(amax_tensor, EPS) # R
Author (@weifengpy, Jun 6, 2024): torch.clamp calls all_reduce here. I avoided calling it again in amax_to_scale(clamp_amax=False).

Contributor: So you are relying on torch.clamp to run the all-reduce implicitly by changing the sharding from partial to replicate? If this fragments the code, could we just all-reduce the amax tensor and then leave the clamp to amax_to_scale? I agree the current way is faster, since we are doing one clamp for all amaxes, but in case the float8 folks are not happy with this fragmentation, this seems like another way.

Author (weifengpy): thanks for the suggestions. I can collect feedback from the float8 folks if they have a preference.

Contributor: can we just add a comment explaining what is going on? I think it's fine as long as the code is easy to understand and there is no magic.

Contributor: agreed

amaxes = torch.split(amax_tensor, 1) # R
return amaxes

if weights:
amaxes = compute_amaxes(weights)
for amax, float8_linear in zip(amaxes, float8_linears):
float8_linear.weight._local_tensor._pre_computed_amax = amax._local_tensor
else:
warnings.warn(
"Calling precompute_float8_weights without any weights using FSDP fp8 all-gather!"
)
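Putting the helper in context, the intended usage (mirroring the test change below) is one extra call per iteration, right after the optimizer step. A minimal sketch; model, optimizer, and dataloader stand in for the usual FSDP2 + Float8DynamicLinear setup and are assumptions here:

from float8_experimental.float8_linear_utils import precompute_float8_amax

for batch in dataloader:
    loss = model(batch).sum()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    # After the fp32 weights are updated, compute amax for all float8 weights in one
    # fused call so the next iteration's fp8 all-gather can skip per-weight amax work.
    precompute_float8_amax(model)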
float8_experimental/float8_utils.py: 11 changes (9 additions, 2 deletions)

@@ -27,17 +27,24 @@

@torch.no_grad()
def amax_to_scale(
-    amax: torch.Tensor, float8_dtype: torch.dtype, orig_dtype: torch.dtype
+    amax: torch.Tensor,
+    float8_dtype: torch.dtype,
+    orig_dtype: torch.dtype,
+    clamp_amax: bool = True,
):
"""Converts the amax value of a tensor to the fp8 scale.
Args:
amax: The amax value of the tensor.
float8_dtype: The float8 dtype.
orig_dtype: The original dtype of the tensor.
clamp_amax: default is True. False for FSDP fp8 all-gather since FSDP applied `torch.clamp` during pre-compute after optimizer.step
Contributor: this is a bit confusing. How about precomputing the scale instead, so we don't have to have gotchas like this?

Author (weifengpy): good suggestion! I changed the API to precompute the scale, and it shows another 9% speed-up in the unit test vs precomputing amax. fsdp_pre_all_gather is also greatly simplified because it uses self._precomputed_scale.
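A rough sketch of the simplified hook the author describes; the final version lands in the later "precompute scale" commits rather than in this diff, so only the attribute name self._precomputed_scale comes from the comment and the rest of the shape is inferred from the hunk above:

def fsdp_pre_all_gather(self, mesh):
    # Sketch only: with a precomputed scale there is no amax-to-scale step left here.
    if self._precomputed_scale is not None:
        float8_tensor = Float8Tensor.to_float8(
            self._tensor,
            self._precomputed_scale,
            torch.float8_e4m3fn,
            mm_config=self._mm_config,
        )
    else:
        float8_tensor = cast_to_float8_e4m3fn(
            self._tensor, self._mm_config, reduce_amax=True
        )
    return (float8_tensor._data,), (float8_tensor._scale,)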

"""
scale = torch.empty_like(amax, dtype=torch.float32)
if float8_dtype in FP8_TYPES:
-        res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
+        if clamp_amax:
Contributor: nit: I think if you put this on a separate line,
    amax = torch.clamp(amax, min=EPS) if clamp_amax else amax
it makes the logic a little easier to follow.

+            res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS)
+        else:
+            res = torch.finfo(float8_dtype).max / amax
else:
raise ValueError(f"Unsupported float8_dtype: {float8_dtype}")

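A quick worked example of the formula above (illustrative only; EPS is assumed to be a tiny constant on the order of 1e-12):

import torch

# float8_e4m3fn has a max representable value of 448.0, so a tensor whose amax is 2.0
# gets scale = 448.0 / 2.0 = 224.0; the cast multiplies by this scale before rounding
# to fp8 precision.
amax = torch.tensor(2.0)
scale = torch.finfo(torch.float8_e4m3fn).max / torch.clamp(amax, min=1e-12)
print(scale)  # tensor(224.)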
test/test_fsdp2/test_fsdp2_common.py: 13 changes (12 additions, 1 deletion)

@@ -6,8 +6,12 @@
import torch
import torch.distributed as dist
import torch.nn as nn
from float8_experimental.float8_dynamic_linear import Float8DynamicLinear
from float8_experimental.float8_linear import Float8Linear
-from float8_experimental.float8_linear_utils import sync_float8_amax_and_scale_history
+from float8_experimental.float8_linear_utils import (
+    precompute_float8_amax,
+    sync_float8_amax_and_scale_history,
+)


def check_parity_no_mp(
@@ -18,6 +22,7 @@ def check_parity_no_mp(
fsdp_optim: torch.optim.Optimizer,
local_inp: torch.Tensor,
module_cls: Type,
pre_compute: bool = False,
):
for iter_idx in range(10):
losses: List[torch.Tensor] = []
@@ -32,6 +37,12 @@ def check_parity_no_mp(
if module_cls is Float8Linear:
sync_float8_amax_and_scale_history(model)
optim.step()
if (
model is fsdp_model
and module_cls is Float8DynamicLinear
and pre_compute
):
precompute_float8_amax(model)
test_cls.assertEqual(losses[0], losses[1])


test/test_fsdp2/test_fsdp2_eager.py: 26 changes (22 additions, 4 deletions)

@@ -85,10 +85,21 @@ def world_size(self) -> int:

@skip_if_lt_x_gpu(2)
def test_transformer_parity_dynamic(self):
-        for enable_fsdp_fp8_all_gather in [False, True]:
-            self._test_transformer_parity_dynamic(enable_fsdp_fp8_all_gather)
+        self.run_subtests(
+            {
+                "enable_fsdp_fp8_all_gather": [False, True],
+                "pre_compute": [False, True],
+            },
+            self._test_transformer_parity_dynamic,
+        )

-    def _test_transformer_parity_dynamic(self, enable_fsdp_fp8_all_gather: bool):
+    def _test_transformer_parity_dynamic(
+        self,
+        enable_fsdp_fp8_all_gather: bool,
+        pre_compute: bool,
+    ):
+        if not enable_fsdp_fp8_all_gather and pre_compute:
+            return
# NOTE: Weight-tying does not compose with fp8 all-gather because the
# embedding weight and output linear weight are tied but only the
# latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to
@@ -109,7 +120,14 @@ def _test_transformer_parity_dynamic(self, enable_fsdp_fp8_all_gather: bool):
0, ref_module.tok_embeddings.weight.size(0), (16, 16), device="cuda"
)
check_parity_no_mp(
-            self, ref_module, ref_optim, module, optim, local_inp, Float8DynamicLinear
+            self,
+            ref_module,
+            ref_optim,
+            module,
+            optim,
+            local_inp,
+            Float8DynamicLinear,
+            pre_compute,
)

@skip_if_lt_x_gpu(2)