diff --git a/applications/Chat/requirements.txt b/applications/Chat/requirements.txt
index e78b203029a5..cfe82e706494 100644
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
@@ -1,4 +1,4 @@
-transformers>=4.20.1
+transformers>=4.33.1
 tqdm
 datasets
 loralib
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index bc99be4cc391..00e4b07ef744 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -58,6 +58,7 @@ def gpt2_model_forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         logger = logging.get_logger(__name__)
+        print("attention_mask_input" + str(attention_mask.shape))
 
         # Preprocess passed in arguments
         # TODO(baizhou): left the recording kv-value tensors as () or None type, this feature may be added in the future.
@@ -94,9 +95,9 @@ def gpt2_model_forward(
             if hidden_states is None:
                 raise ValueError("hidden_states shouldn't be None for stages other than the first stage.")
             input_shape = hidden_states.size()[:-1]
-            batch_size = input_shape[0]
             device = hidden_states.device
             hidden_states = hidden_states.view((-1,) + hidden_states.shape[-2:])
+            batch_size = hidden_states.shape[0]
 
         # GPT2Attention mask.
         if attention_mask is not None:
@@ -176,6 +177,7 @@ def gpt2_model_forward(
             block = self.h[i]
             torch.cuda.set_device(hidden_states.device)
             # Ensure that attention_mask is always on the same device as hidden_states
+            print("attention_mask_pp" + str(attention_mask.shape))
             if attention_mask is not None:
                 attention_mask = attention_mask.to(hidden_states.device)
             if isinstance(head_mask, torch.Tensor):
diff --git a/tests/test_shardformer/test_model/_utils.py b/tests/test_shardformer/test_model/_utils.py
index c9c6447a43f0..52df0ba4fe9e 100644
--- a/tests/test_shardformer/test_model/_utils.py
+++ b/tests/test_shardformer/test_model/_utils.py
@@ -150,12 +150,17 @@ def _criterion(outputs, inputs):
                 data[k] = v.repeat(input_shape[:-1] + (input_shape[-1] * times,))
 
     sharded_model.train()
+    for k, v in data.items():
+        if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
+            new_shape = [1] * v.dim()
+            new_shape[0] = 4
+            data[k] = v.to('cuda').repeat(*new_shape)
     if booster.plugin.stage_manager is not None:
-        for k, v in data.items():
-            if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
-                new_shape = [1] * v.dim()
-                new_shape[0] = 4
-                data[k] = v.to('cuda').repeat(*new_shape)
+        # for k, v in data.items():
+        #     if torch.is_tensor(v) or 'Tensor' in v.__class__.__name__:
+        #         new_shape = [1] * v.dim()
+        #         new_shape[0] = 4
+        #         data[k] = v.to('cuda').repeat(*new_shape)
 
         data_iter = iter([data])
         sharded_output = booster.execute_pipeline(data_iter,
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index c4cc3812dbfd..5f62a0d3d1e2 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -171,7 +171,7 @@ def run_gpt2_test(test_config):
     {
         'tp_size': 2,
         'pp_size': 2,
-        'num_microbatches': 4,
+        'num_microbatches': 2,
         'enable_all_optimization': False,
         'use_lazy_init': False,
         'precision': 'fp32',