Refactor ZeRO, checkpoint and pipeline code #128

Merged · 73 commits · Aug 21, 2023

Commits
bcea035  using hooks to implement ZeRO and Checkpoint (Jul 24, 2023)
7b080e7  async backward (Jul 25, 2023)
be5f9d7  async forward (Jul 25, 2023)
dea1781  merge upstream (Jul 25, 2023)
05bc553  fix (Jul 25, 2023)
bdf7087  save cuda_rng_state (Jul 26, 2023)
6a366e3  fix (Jul 27, 2023)
25ef84f  fix (Jul 27, 2023)
768f209  fix (Jul 27, 2023)
324e0dd  remove __call__ (Jul 31, 2023)
0f4ddb5  refactor code structure (Jul 31, 2023)
76c5c26  pipeline (Jul 31, 2023)
16c0922  for low version (Jul 31, 2023)
2d35ba0  for low torch version (Jul 31, 2023)
bc48d83  for checkpoint (Jul 31, 2023)
bd61071  remove unused code (Jul 31, 2023)
de25455  remove duplicate code (Jul 31, 2023)
fde122f  fix pipeline; checkpoint support low version (Aug 1, 2023)
a897ad4  fix pipeline; checkpoint support low version (Aug 1, 2023)
ca50795  merge remote (Aug 1, 2023)
ec8385b  fix indent (Aug 1, 2023)
9877a81  pipe support low version (Aug 2, 2023)
28993b5  custom linear for zero3 (Aug 2, 2023)
4d43952  merge origin (Aug 3, 2023)
e4eaebf  resolve conflict (Aug 3, 2023)
cba7c55  resolve conflict (Aug 3, 2023)
839a976  use torch.utils.checkpoint.checkpoint (Aug 3, 2023)
d5bbf1a  custom hook (Aug 4, 2023)
e92d0ef  optimize code structure (Aug 4, 2023)
6ba753e  for hidden_state (Aug 4, 2023)
b0a0da9  for input.requires_grad is False (Aug 4, 2023)
f4a0e0b  fix (Aug 5, 2023)
8faff0f  pipeline support return hidden_state (Aug 6, 2023)
26c8c94  fix args (Aug 7, 2023)
b7d1c8c  fix test (Aug 7, 2023)
4303575  CheckpointBlock -> BMTBlock (Aug 8, 2023)
8061b66  reset block name (Aug 8, 2023)
845f210  pipeline support batch_related (Aug 8, 2023)
0b14fe5  remove use_checkpoint from init_distributed (Aug 9, 2023)
ae56de8  for requires_grad (Aug 10, 2023)
27ae2b7  for requires_grad (Aug 10, 2023)
fdc8231  fix for arg is not tensor (Aug 10, 2023)
b0f7154  fix for arg is not a tensor (Aug 10, 2023)
420b626  add test (Aug 10, 2023)
b843489  Merge branch 'hook' of https://github.com/zkh2016/BMTrain into hook (Aug 10, 2023)
ebc269f  merge enhance_ckp (Aug 11, 2023)
2f1e766  enhance ckp (Aug 11, 2023)
1e993c6  refactor code (Aug 12, 2023)
24d0f59  mv linear to bmt.nn.linear (Aug 12, 2023)
ff72e66  for enhance_ckp (Aug 12, 2023)
1fbf3b2  fix for all input not grad (Aug 14, 2023)
ace5216  fix pre_module (Aug 14, 2023)
52cd4e2  fix pre_module (Aug 14, 2023)
0b0bd0b  fix for all input no grad (Aug 14, 2023)
05b49f8  fix for all input no grad (Aug 14, 2023)
64eb672  Merge branch 'main' of https://github.com/OpenBMB/BMTrain into hook (Aug 16, 2023)
88b5bd3  fix reentrant (Aug 17, 2023)
9c2e47d  Merge branch 'hook' of https://github.com/zkh2016/BMTrain into hook (Aug 17, 2023)
e93e6dc  Merge branch 'dev' into hook (Aug 18, 2023)
fd49311  refactor CheckpointBlock (Aug 20, 2023)
221bdc3  refactor pipe (Aug 20, 2023)
76f74e5  Merge branch 'hook' of https://github.com/zkh2016/BMTrain into hook (Aug 20, 2023)
9c63407  fix all input no grad (Aug 20, 2023)
f72fcfc  fix hiddenstate (Aug 20, 2023)
ebdf519  fix test (Aug 21, 2023)
780ca20  fix (Aug 21, 2023)
6df85e7  remove unused import (Aug 21, 2023)
bb482d6  fix pre_module (Aug 21, 2023)
1010d26  recovery some code (Aug 21, 2023)
b580530  add test_no_grad.py (Aug 21, 2023)
767a875  test unroll block list (Aug 21, 2023)
d19a627  fix test_fp32 (Aug 21, 2023)
bf986a7  cross_entropy support fp32 (Aug 21, 2023)
76 changes: 34 additions & 42 deletions bmtrain/block_layer.py
@@ -7,7 +7,6 @@
from .synchronize import wait_loader
from .parameter import DistributedParameter, OpAllGather
from .checkpointing import (
ScopedTensorInspectorContext,
CheckpointBlockContext
)

@@ -50,32 +49,6 @@ def _get_param_kw(param : DistributedParameter):
group_name = "_g_" + param.group
return type_name + grad_name + group_name

class BMTBlockContext:
def __init__(self):
self._pre_module = None
self._first = True

def link_module(self, module):
if not self._first and module._ref_count == -1:
self._pre_module = module
module._ref_count = 1
return

if self._pre_module is None:
module._ref_count = 1
module._is_first_layer = True
else:
if module._ref_count == 0:
module._is_first_layer = False
self._pre_module.set_next_module(module)
self._pre_module._is_last_layer = False
self._pre_module = module
self._first = False

def clear(self):
self._pre_module = None
self._first = True

class CheckpointBlock(torch.nn.Module):
""" A bmtrain block containing two memory-saving methods of ZeRO-2/3 and checkpoint.

@@ -94,7 +67,7 @@ class CheckpointBlock(torch.nn.Module):
>>> y2, ... = transformer_block(x)
>>> assert torch.allclose(y1, y2)
"""
def __init__(self, inner_module : torch.nn.Module, use_checkpoint=True, block_context=None):
def __init__(self, inner_module : torch.nn.Module, use_checkpoint=True):
super().__init__()
self._module = inner_module
self._inputs = None
@@ -222,25 +195,35 @@ def __init__(self, inner_module : torch.nn.Module, use_checkpoint=True, block_co
self.use_checkpoint = use_checkpoint
self._is_first_layer = True
self._is_last_layer = True
self._pre_module = []
self._next_module = []
self._ref_count = 0
self._release_list = [True]
self._next_module = [] #save the next module of self
self._pre_module = [] #save the pre module of self
self._ref_count = 0 #incremental in forward and decreasing in backward
self._mode = "BLOCK" #BLOCK or ZERO or PIPE
self.return_hidden_states = False
self.hidden_states = []
self.block_context = block_context
if block_context is None:
self.block_context = config['block_context'][config['rank']]
self.all_input_no_grad = False
self.all_param_no_grad = False

def set_next_module(self, module):
self._next_module.append(module)
module._pre_module.append(self)
module._ref_count += 1
def set_pre_module(self, pre_module):
if pre_module is not None:
self._pre_module.append(pre_module)
pre_module._next_module.append(self)

def pre_module(self):
return self._pre_module[self._ref_count-1]

def next_module(self):
assert len(self._next_module) == self._ref_count, "{} != {}".format(len(self._next_module), self._ref_count)
return self._next_module[self._ref_count-1]

def backward_release(self, flag):
if self._ref_count == 1:
self._backward_block_ctx.exit(flag, True)
config['load_stream'].record_event(config['load_event'])
self._ref_count -= 1

def pre_hook(self, *args):
if self._mode != "PIPE":
self.block_context.link_module(self)
grad_tensors = []
grad_index = []
arg_list = list(args)
@@ -255,9 +238,11 @@ def pre_hook(self, *args):
arg_list[grad_index[i]] = pre_out[i]

if self._mode != "PIPE" and len(grad_tensors) == 0:
self.all_param_no_grad = True
for param in self._param_info:
if param['parameter'].requires_grad:
param['parameter'].register_hook(lambda grad: hook_func.zero_post_backward(self, grad, None))
self.all_param_no_grad = False
break
self.all_input_no_grad = True
else:
@@ -537,16 +522,23 @@ def __init__(self, modules: Iterable[CheckpointBlock], num_hidden=1, sqrt=False)
super().__init__()

self._modules = {}
release_list = []
pre_module = None
for i, module in enumerate(modules):
if not isinstance(module, CheckpointBlock):
module = CheckpointBlock(module)

module._mode = "ZERO"
module._is_last_layer = True if i == len(modules) -1 else False
module._is_first_layer = True if i == 0 else False
module.set_pre_module(pre_module)
pre_module = module
self._is_first_layer = False
self._is_last_layer = False

self._modules[str(i)] = module
self.add_module(str(i), module)

self._modules[str(0)]._is_first_layer = True
self._modules[str(len(modules)-1)]._is_last_layer = True

self.num_hidden = num_hidden

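A condensed, runnable sketch of the linking and reference-counting scheme introduced above may make the release logic easier to follow: each block records its predecessor and successor (set_pre_module), the counter grows once per forward pass through the block, and backward_release frees the gathered parameters only on the last remaining reference. The toy Block class, on_forward and the prints below are illustrative stand-ins, not BMTrain code:

```python
class Block:
    def __init__(self, name):
        self.name = name
        self._pre_module = []    # blocks that run immediately before this one
        self._next_module = []   # blocks that run immediately after this one
        self._ref_count = 0      # grows in forward, shrinks in backward

    def set_pre_module(self, pre_module):
        # Mirrors CheckpointBlock.set_pre_module: link both directions.
        if pre_module is not None:
            self._pre_module.append(pre_module)
            pre_module._next_module.append(self)

    def on_forward(self):
        # Stand-in for the forward hook that bumps the counter.
        self._ref_count += 1

    def next_module(self):
        return self._next_module[self._ref_count - 1]

    def backward_release(self):
        # Mirrors CheckpointBlock.backward_release: free only on the last reference.
        if self._ref_count == 1:
            print(f"{self.name}: release gathered parameters")
        self._ref_count -= 1

# Chain the blocks the way TransformerBlockList.__init__ does.
blocks = [Block(f"block{i}") for i in range(3)]
for prev, cur in zip([None] + blocks[:-1], blocks):
    cur.set_pre_module(prev)

for b in blocks:                  # forward pass
    b.on_forward()
for b in reversed(blocks):        # backward pass runs in reverse order
    if b._next_module:            # release the successor whose backward just finished
        b.next_module().backward_release()
blocks[0].backward_release()      # the first block releases itself at the very end
```

Reusing a block several times in one forward pass pushes _ref_count above one, which appears to be why the real next_module() indexes _next_module with _ref_count - 1 and why the block context is only exited on the final decrement.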
21 changes: 6 additions & 15 deletions bmtrain/hook_func.py
@@ -25,20 +25,16 @@ def zero_post_forward(module, inputs, outputs):

if exit:
module._forward_block_ctx.exit(forward_flag)
if module._mode != "PIPE":
module._ref_count += 1

def zero_pre_backward(module, grad_outputs):
backward_flag = 2 if config['zero_level'] == 2 else 0
if module._mode != "PIPE":
module._backward_block_ctx = CheckpointBlockContext(module, module._layer_dict)
module._backward_block_ctx.enter(backward_flag, True)
if not module._is_last_layer and len(module._next_module) > 0 and module._next_module[-1]._backward_block_ctx is not None:
if module._next_module[-1]._ref_count == 1:
module._next_module[-1]._ref_count = 0
module._next_module.pop()._backward_block_ctx.exit(backward_flag, True)
config['load_stream'].record_event(config['load_event'])
else:
module._next_module[-1]._ref_count -= 1

if not module._is_last_layer:
module.next_module().backward_release(backward_flag)
else:
if module._micro_idx == config['micros'] - 1:
module._backward_block_ctx = CheckpointBlockContext(module, module._layer_dict, pipe=True)
@@ -47,15 +43,10 @@ def zero_post_backward(module, grad_inputs, grad_outputs):
def zero_post_backward(module, grad_inputs, grad_outputs):
backward_flag = 2 if config['zero_level'] == 2 else 0
if module._mode != "PIPE":
if module._is_first_layer and module._ref_count == 1:
module._backward_block_ctx.exit(backward_flag, True)
module._ref_count = -1
config['load_stream'].record_event(config['load_event'])
if not module._is_first_layer and len(module._pre_module) > 0:
module._pre_module.pop()
if module._is_first_layer:
module.backward_release(backward_flag)
else:
if module._micro_idx == 0:
module._ref_count = -1 if module._is_first_layer else 0
module._backward_block_ctx.exit(backward_flag, True)
config['load_stream'].record_event(config['load_event'])

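One detail of the PIPE branch above is worth spelling out. The backward hooks fire once per micro-batch, and the two checks imply that backward visits micro-batches from the last one down to the first: the _micro_idx == config['micros'] - 1 test enters the backward block context exactly once, at the start of the backward phase, and the _micro_idx == 0 test exits it exactly once, at the end. A plain-Python dry run of that control flow (micros and the prints are placeholders, no BMTrain involved):

```python
micros = 4                                   # stand-in for config['micros']
for micro_idx in reversed(range(micros)):    # backward sees micro-batches last-to-first
    if micro_idx == micros - 1:
        print("enter backward block context (gather ZeRO-partitioned params)")
    print(f"run backward for micro-batch {micro_idx}")
    if micro_idx == 0:
        print("exit backward block context (release params)")
```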
4 changes: 0 additions & 4 deletions bmtrain/init.py
@@ -7,7 +7,6 @@
from .global_var import config
from . import nccl
from .synchronize import synchronize
from .block_layer import BMTBlockContext

def init_distributed(
init_method : str = "env://",
@@ -74,9 +73,6 @@ def init_distributed(
config["zero_level"] = zero_level
config["topology"] = topology(config)
config["zero_rank"] = config["topology"].get_group_rank("zero") if pipe_size > 1 else config['rank']
config["block_context"] = []
for i in range(world_size):
config["block_context"].append(BMTBlockContext())
cpus_this_worker = None

all_available_cpus = sorted(list(os.sched_getaffinity(0)))
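This hunk drops the per-rank BMTBlockContext list from init_distributed, and commit 0b14fe5 in this PR already removed use_checkpoint from it, so after the refactor the two knobs live in different places: the ZeRO level is still a global init_distributed argument, while activation checkpointing is chosen per block through CheckpointBlock(..., use_checkpoint=...). A hedged usage sketch, assuming a bmtrain install and a distributed launch (e.g. torchrun); the layer type and sizes are placeholders, and whether a plain torch.nn.Linear is an appropriate module to wrap depends on BMTrain's parameter handling:

```python
import torch
import bmtrain as bmt

bmt.init_distributed(zero_level=3)                 # ZeRO level: chosen once, globally

layers = bmt.TransformerBlockList([
    bmt.CheckpointBlock(torch.nn.Linear(1024, 1024),
                        use_checkpoint=True)       # checkpointing: chosen per block
    for _ in range(4)
])
```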