OpenBMB · a710128 · Jan 22, 2023 · Dec 22, 2022
diff --git a/bmtrain/inspect/model.py b/bmtrain/inspect/model.py
@@ -100,8 +100,8 @@ def inspect_pipeline_transformer_block_list(pipe_model: PipelineTransformerBlock
                             "shape": tuple(shape),
                             "std": p.std().cpu().item(),
                             "mean": p.mean().cpu().item(),
-                            "grad_std": None,
-                            "grad_mean": None,
+                            "grad_std": 0.,
+                            "grad_mean": 0.,
                             "max": p.max().cpu().item(),
                             "min": p.min().cpu().item(),
                         }
@@ -180,8 +180,8 @@ def inspect_checkpoint_block(model : CheckpointBlock, param_name : str, prefix :
                     "shape": tuple(shape),
                     "std": p.std().cpu().item(),
                     "mean": p.mean().cpu().item(),
-                    "grad_std": None,
-                    "grad_mean": None,
+                    "grad_std": 0.,
+                    "grad_mean": 0.,
                     "max": p.max().cpu().item(),
                     "min": p.min().cpu().item(),
                 })
@@ -236,8 +236,8 @@ def inspect_model(model : torch.nn.Module, param_name : str, prefix : str = ''):
                     stats["grad_std"] = g.std().cpu().item()
                     stats["grad_mean"] = g.mean().cpu().item()
                 else:
-                    stats["grad_std"] = None
-                    stats["grad_mean"] = None
+                    stats["grad_std"] = 0.
+                    stats["grad_mean"] = 0.
                 ret.append(stats)
         for name, module in model._modules.items():
             ret.extend(inspect_model(module, param_name, prefix + name + '.'))

diff --git a/tests/test_all.py b/tests/test_all.py
@@ -8,6 +8,7 @@
     ("init_parameters_multi_gpu", 4),
 
     ("requires_grad", 1),
+    ("requires_grad_multi_gpu", 2),
     ("has_inf_nan", 1),
     ("dropout", 1),
     ("loss_func", 1),

diff --git a/tests/test_init_parameters.py b/tests/test_init_parameters.py
@@ -168,10 +168,12 @@ def test_main():
     manual_seed(33)
     m[2] = Linear_BMTInitializer(*shape)
     bmt.init_parameters(m[2])
+    bmt.synchronize()
     ret[2] = (m[2].weight.data, m[2].bias.data)
 
     manual_seed(33)
     m[3] = Linear_ManualInitBefore(*shape)
+    bmt.synchronize()
     ret[3] = (m[3].weight.data, m[3].bias.data)
 
     # manual_seed(33)
@@ -211,6 +213,7 @@ def test_main():
         print(ret[i])
     for i in range(10):
         for j in range(10):
+            print(i, j)
             assert_all_eq(ret[i][0], ret[j][0])
             assert_all_eq(ret[i][1], ret[j][1])
 

diff --git a/tests/test_inspector_grad.py → tests/test_inspector_hidden.py b/tests/test_inspector_grad.py → tests/test_inspector_hidden.py
@@ -189,7 +189,7 @@ def sub_run(name, cls, num_layer, dim, batch, seq_len):
         inspector.get_summary()
     ) + "\n"
 
-    return ret.replace("None  ", "0.0000") + "\n" # replace for matching None grad with zero_grad
+    return ret + "\n" # no None -> 0.0000 replacement needed: bmtrain/inspect/model.py now reports 0. for grad_std/grad_mean instead of None
 
 def run(name, cls, num_layer=4, dim=4096, batch=32, seq_len=256):
     ret = ""
@@ -215,8 +215,7 @@ def test_main():
                 assert len(words) == len(words2)
                 for w, w2 in zip(words, words2):
                     try:
-                        if isinstance(eval(w), float):
-                            is_float = True
+                        is_float = isinstance(eval(w), float)
                     except:
                         is_float = False
                     if is_float:

diff --git a/tests/test_middle_hidden.py b/tests/test_middle_hidden.py
@@ -168,7 +168,7 @@ def sub_run(name, cls, num_layer, dim, batch, seq_len, only_last=False, only_mid
         ret += bmt.inspect.format_summary(
             bmt.inspect.inspect_model(m, '*')
         )
-    return ret.replace("None  ", "0.0000") + "\n" # replace for matching None grad with zero_grad
+    return ret + "\n" # no None -> 0.0000 replacement needed: bmtrain/inspect/model.py now reports 0. for grad_std/grad_mean instead of None
 
 def run(name, cls, num_layer=4, dim=4096, batch=32, seq_len=256):
     ret = ""
@@ -181,16 +181,30 @@ def run(name, cls, num_layer=4, dim=4096, batch=32, seq_len=256):
     return ret
 
 def test_main():
-    ret = []
-    ret.append( run("normal", Model_NORMAL) )
-    ret.append( run("block", Model_BLOCK) )
-    ret.append( run("zero", Model_ZERO) )
-    ret.append( run("pipe", Model_PIPE) )
-    for r in ret:
+    """Run all four model variants and check that their inspect summaries agree."""
+    variants = (("normal", Model_NORMAL), ("block", Model_BLOCK), ("zero", Model_ZERO), ("pipe", Model_PIPE))
+    ret = {name: run(name, cls) for name, cls in variants}
+    for k, r in ret.items():
+        bmt.print_rank(f"============={k}============")
         bmt.print_rank(r)
-    for r in ret:
-        for r2 in ret:
-            assert_eq(r, r2)
+    # Compare every pair of summaries token by token: float tokens within 2., all others exactly.
+    for r in ret.values():
+        for r2 in ret.values():
+            lines, lines2 = r.split('\n'), r2.split('\n')
+            assert len(lines) == len(lines2)
+            for line, line2 in zip(lines, lines2):
+                words, words2 = line.split(), line2.split()
+                assert len(words) == len(words2)
+                for w, w2 in zip(words, words2):
+                    # eval() only classifies tokens of our own summary text; tokens that fail to evaluate count as non-float.
+                    try:
+                        is_float = isinstance(eval(w), float)
+                    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must still propagate
+                        is_float = False
+                    if is_float:
+                        assert_lt(abs(float(w)-float(w2)), 2.)
+                    else:
+                        assert_eq(w, w2)
 
 if __name__ == "__main__":
     bmt.init_distributed(pipe_size=4)

diff --git a/tests/test_other_hidden.py b/tests/test_other_hidden.py
@@ -159,7 +159,7 @@ def sub_run(name, cls, num_layer, dim, batch, seq_len, only_pre=False, only_post
         ret += bmt.inspect.format_summary(
             bmt.inspect.inspect_model(m, '*')
         )
-    return ret.replace("None  ", "0.0000") + "\n" # replace for matching None grad with zero_grad
+    return ret + "\n" # no None -> 0.0000 replacement needed: bmtrain/inspect/model.py now reports 0. for grad_std/grad_mean instead of None
 
 def run(name, cls, num_layer=4, dim=4096, batch=32, seq_len=256):
     ret = ""

diff --git a/tests/test_requires_grad.py b/tests/test_requires_grad.py
@@ -4,6 +4,7 @@
 import torch
 from bmtrain import config
 from bmtrain.block_layer import CheckpointBlockContext,  CheckpointBlock, TransformerBlockList
+from bmtrain.pipe_layer import PipelineTransformerBlockList
 from typing import List
 import torch.nn.functional as F
 
@@ -67,7 +68,36 @@ def test_main():
     assert_neq(sm2.split('\n')[1], sm3.split('\n')[1])
     assert_eq(sm2.split('\n')[2], sm3.split('\n')[2])
 
+def test_main_pipe():
+    """Toggle requires_grad on `a` inside a PipelineTransformerBlockList and check the inspect summaries."""
+    a, b = Linear(256, 256), Linear(256, 256)
+    m = PipelineTransformerBlockList([CheckpointBlock(a), CheckpointBlock(b)])
+    bmt.init_parameters(m)
+    # Bias frozen: summary line 2 must end in two "0.0000" fields (awg/abg are returned by run(), defined elsewhere in the file).
+    a.bias.requires_grad_(False)
+    awg, abg, sm1 = run(m, a, b)
+    print(awg, abg, sm1)
+    assert_eq((awg, abg), (False, True))
+    assert_eq(sm1.split('\n')[2].split()[-2:], ["0.0000", "0.0000"])
+    # Weight frozen, bias trainable: summary line 1 stays the same, line 2 changes.
+    a.weight.requires_grad_(False)
+    a.bias.requires_grad_(True)
+    awg, abg, sm2 = run(m, a, b)
+    print(awg, abg, sm2)
+    assert_eq((awg, abg), (False, False))
+    assert_eq(sm1.split('\n')[1], sm2.split('\n')[1])
+    assert_neq(sm1.split('\n')[2], sm2.split('\n')[2])
+    # Weight trainable, bias frozen again: summary line 1 changes, line 2 stays the same.
+    a.weight.requires_grad_(True)
+    a.bias.requires_grad_(False)
+    awg, abg, sm3 = run(m, a, b)
+    print(awg, abg, sm3)
+    assert_eq((awg, abg), (False, False))
+    assert_neq(sm2.split('\n')[1], sm3.split('\n')[1])
+    assert_eq(sm2.split('\n')[2], sm3.split('\n')[2])
+
 if __name__ == "__main__":
-    bmt.init_distributed()
+    bmt.init_distributed(pipe_size=1)
 
-    test_main()
+    test_main()
+    test_main_pipe()
diff --git a/tests/test_requires_grad_multi_gpu.py b/tests/test_requires_grad_multi_gpu.py
@@ -0,0 +1,96 @@
+from utils import *
+
+import bmtrain as bmt
+import torch
+from bmtrain import config
+from bmtrain.block_layer import CheckpointBlockContext,  CheckpointBlock, TransformerBlockList
+from bmtrain.pipe_layer import PipelineTransformerBlockList
+from typing import List
+import torch.nn.functional as F
+
+class Linear(bmt.DistributedModule):
+    """Linear layer test fixture whose weight and bias are `bmt.DistributedParameter`s."""
+    def __init__(self, in_features : int, out_features: int, init_weight = None, init_bias = None) -> None:
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.out = {}
+        # Test against None: truthiness raises on multi-element arrays/tensors and ignores empty or zero values.
+        if init_weight is not None:
+            self.weight = bmt.DistributedParameter(torch.tensor(init_weight, dtype=torch.float, device="cuda").reshape(out_features, in_features))
+        else:
+            self.weight = bmt.DistributedParameter(torch.empty(out_features, in_features, dtype=torch.float, device="cuda"), init_method=torch.nn.init.xavier_normal_)
+        if init_bias is not None:
+            self.bias = bmt.DistributedParameter(torch.tensor(init_bias, dtype=torch.float, device="cuda").reshape(out_features,))
+        else:
+            self.bias = bmt.DistributedParameter(torch.empty(out_features, dtype=torch.float, device="cuda"), init_method=torch.nn.init.zeros_)
+
+    def forward(self, input):
+        """Return `F.linear(input, self.weight, self.bias)`."""
+        return F.linear(input, self.weight, self.bias)
+
+def run(m, a, b):
+    """Run one forward/backward pass of `m` on random input and return its formatted inspect summary.
+
+    `a` and `b` are accepted but not used in this function.
+    """
+    batch = torch.rand((1, 10, 256)).cuda() * 100
+    m(batch).sum().backward()
+    # '*' is passed as the param_name pattern (presumably matching every parameter).
+    stats = bmt.inspect.inspect_model(m, '*')
+    return bmt.inspect.format_summary(stats)
+
+def test_main():
+    """Toggle requires_grad on `a` inside a TransformerBlockList and check the inspect summaries."""
+    a, b = Linear(256, 256), Linear(256, 256)
+    m = TransformerBlockList([CheckpointBlock(a), CheckpointBlock(b)])
+    bmt.init_parameters(m)
+    # Bias frozen: the last two fields of summary line 2 must read "0.0000".
+    a.bias.requires_grad_(False)
+    sm1 = run(m, a, b)
+    print(sm1)
+    assert_eq(sm1.split('\n')[2].split()[-2:], ["0.0000", "0.0000"])
+    # Weight frozen, bias trainable: summary line 1 stays the same, line 2 changes.
+    a.weight.requires_grad_(False)
+    a.bias.requires_grad_(True)
+    sm2 = run(m, a, b)
+    print(sm2)
+    assert_eq(sm1.split('\n')[1], sm2.split('\n')[1])
+    assert_neq(sm1.split('\n')[2], sm2.split('\n')[2])
+    # Weight trainable, bias frozen again: summary line 1 changes, line 2 stays the same.
+    a.weight.requires_grad_(True)
+    a.bias.requires_grad_(False)
+    sm3 = run(m, a, b)
+    assert_neq(sm2.split('\n')[1], sm3.split('\n')[1])
+    assert_eq(sm2.split('\n')[2], sm3.split('\n')[2])
+
+def test_main_pipe():
+    """Toggle requires_grad on `a` inside a PipelineTransformerBlockList and check the inspect summaries."""
+    a, b = Linear(256, 256), Linear(256, 256)
+    m = PipelineTransformerBlockList([CheckpointBlock(a), CheckpointBlock(b)])
+    bmt.init_parameters(m)
+    # Bias frozen: the last two fields of summary line 2 must read "0.0000".
+    a.bias.requires_grad_(False)
+    sm1 = run(m, a, b)
+    print(sm1)
+    assert_eq(sm1.split('\n')[2].split()[-2:], ["0.0000", "0.0000"])
+    # Weight frozen, bias trainable: summary line 1 stays the same, line 2 changes.
+    a.weight.requires_grad_(False)
+    a.bias.requires_grad_(True)
+    sm2 = run(m, a, b)
+    print(sm2)
+    assert_eq(sm1.split('\n')[1], sm2.split('\n')[1])
+    assert_neq(sm1.split('\n')[2], sm2.split('\n')[2])
+    # Weight trainable, bias frozen again: summary line 1 changes, line 2 stays the same.
+    a.weight.requires_grad_(True)
+    a.bias.requires_grad_(False)
+    sm3 = run(m, a, b)
+    print(sm3)
+    assert_neq(sm2.split('\n')[1], sm3.split('\n')[1])
+    assert_eq(sm2.split('\n')[2], sm3.split('\n')[2])
+
+if __name__ == "__main__":
+    bmt.init_distributed(pipe_size=2)
+    # Run the TransformerBlockList case first, then the PipelineTransformerBlockList case.
+    test_main()
+    test_main_pipe()