Correct llava mask & fix missing setter for vocab_size #29389

Merged (8 commits) on Mar 22, 2024
src/transformers/models/llava/modeling_llava.py (4 additions, 4 deletions)

@@ -444,11 +444,11 @@ def forward(
     # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
     batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
 
-    # Get the target length
-    target_seqlen = first_layer_past_key_value.shape[-1] + 1
+    target_length = input_ids.shape[1]
+    past_length = first_layer_past_key_value.shape[-1]
 
     extended_attention_mask = torch.ones(
-        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+        (attention_mask.shape[0], past_length),
         dtype=attention_mask.dtype,
         device=attention_mask.device,
     )
@@ -463,7 +463,7 @@ def forward(
     # Zero-out the places where we don't need to attend
     extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
 
-    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+    attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
     position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
 
     outputs = self.language_model(
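For context, a minimal standalone sketch of the corrected mask construction above, run on toy shapes (the batch size, cache length, and the non-attended slot are made up for illustration; new_batch_index and new_non_attended_tokens come from the unchanged code just before the second hunk). The point of the fix is that the rebuilt mask always ends up with width past_length + target_length, i.e. the key length of the cached forward pass; the old expression target_seqlen - attention_mask.shape[1] only yields the right total width when a single new token is fed, and can even go negative if the incoming mask is longer than the cache plus one.

import torch

# Toy setup: batch of 2, a KV cache holding 5 past positions, 1 new input token.
# All values here are illustrative, not taken from the model.
batch_size, past_length = 2, 5
input_ids = torch.tensor([[42], [7]])   # hypothetical new token ids
target_length = input_ids.shape[1]      # 1 during cached generation

# Mask handed in by generate(), covering past + current positions.
attention_mask = torch.ones((batch_size, past_length + target_length), dtype=torch.long)

# Ones over the cached positions; sized by the cache length itself rather than
# by target_seqlen - attention_mask.shape[1] as before the fix.
extended_attention_mask = torch.ones(
    (attention_mask.shape[0], past_length),
    dtype=attention_mask.dtype,
    device=attention_mask.device,
)

# Pretend slot 3 of batch 0 was identified earlier as a non-attended cache position.
new_batch_index = torch.tensor([0])
new_non_attended_tokens = torch.tensor([3])
extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0

# Cache mask first, then only the slice of the incoming mask that covers input_ids.
attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
print(attention_mask.shape)  # torch.Size([2, 6]) == (batch, past_length + target_length)

The change to src/transformers/models/vipllava/modeling_vipllava.py below mirrors this exactly.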
src/transformers/models/vipllava/modeling_vipllava.py (4 additions, 4 deletions)

@@ -443,11 +443,11 @@ def forward(
     # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
     batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-1) == 0)
 
-    # Get the target length
-    target_seqlen = first_layer_past_key_value.shape[-2] + 1
+    target_length = input_ids.shape[1]
+    past_length = first_layer_past_key_value.shape[-1]
 
     extended_attention_mask = torch.ones(
-        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+        (attention_mask.shape[0], past_length),
         dtype=attention_mask.dtype,
         device=attention_mask.device,
     )
@@ -462,7 +462,7 @@ def forward(
     # Zero-out the places where we don't need to attend
     extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
 
-    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+    attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1)
     position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
 
     outputs = self.language_model(
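The position_ids line that both hunks leave untouched derives the next position directly from the rebuilt mask: the row sum counts attended positions, and subtracting one turns that count into the 0-based index of the last attended token, so a zeroed-out cache slot shifts the position back by one. A quick check on a made-up mask:

import torch

# Two rows: one with a masked-out cache slot, one fully attended.
attention_mask = torch.tensor([[1, 1, 1, 0, 1, 1],
                               [1, 1, 1, 1, 1, 1]])
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
print(position_ids)  # tensor([[4], [5]])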