Expand inputs in processors for VLMs #30962

Merged: 44 commits, Aug 13, 2024. Changes shown from 36 of the 44 commits.

Commits
050657f  let it be (zucchini-nlp, May 20, 2024)
a67087e  draft (zucchini-nlp, May 22, 2024)
1e2b873  should not have changed (zucchini-nlp, May 22, 2024)
70145d4  add warnings (zucchini-nlp, May 29, 2024)
16a6787  Merge remote-tracking branch 'upstream/main' into vlm_processors (zucchini-nlp, May 29, 2024)
8472035  fix & add tests (zucchini-nlp, May 29, 2024)
13af9e8  fix tests (zucchini-nlp, May 29, 2024)
41d086f  ipnuts embeds cannot be passed with pixels (zucchini-nlp, May 29, 2024)
bf59ed6  more updates (zucchini-nlp, Jun 7, 2024)
020e7ed  paligemma ready! (zucchini-nlp, Jun 10, 2024)
3e0455c  minor typos (zucchini-nlp, Jun 10, 2024)
674f16e  update blip-2 (zucchini-nlp, Jun 10, 2024)
42ae646  fix tests & raise error (zucchini-nlp, Jun 10, 2024)
b5259f2  Merge branch 'main' into vlm_processors (zucchini-nlp, Jun 10, 2024)
a6c50de  docstring (zucchini-nlp, Jun 10, 2024)
4766e2e  add blip2 test (zucchini-nlp, Jun 10, 2024)
d46df90  Merge branch 'main' into vlm_processors (zucchini-nlp, Jun 10, 2024)
f74297b  tmp (zucchini-nlp, Jun 17, 2024)
5fc8565  add image seq length to config (zucchini-nlp, Jun 18, 2024)
1b4674a  update docstring (zucchini-nlp, Jun 18, 2024)
c3c130b  Merge branch 'main' into vlm_processors (zucchini-nlp, Jun 18, 2024)
8438875  delete (zucchini-nlp, Jun 18, 2024)
bf9e637  fix tests (zucchini-nlp, Jun 18, 2024)
db1fa4f  fix blip (zucchini-nlp, Jun 18, 2024)
246b06a  fix paligemma (zucchini-nlp, Jun 21, 2024)
222bf9a  merge `main` (zucchini-nlp, Jul 18, 2024)
5486215  out-of-place scatter (zucchini-nlp, Jul 18, 2024)
78c4484  add llava-next-video (zucchini-nlp, Jul 18, 2024)
d60624e  Update src/transformers/models/blip_2/modeling_blip_2.py (zucchini-nlp, Aug 5, 2024)
1973b39  remove tmp (zucchini-nlp, Aug 5, 2024)
a6e380f  merge `main` (zucchini-nlp, Aug 5, 2024)
8e88d8b  codestyle (zucchini-nlp, Aug 5, 2024)
689eed9  nits (zucchini-nlp, Aug 6, 2024)
28e8054  more nits (zucchini-nlp, Aug 6, 2024)
637e514  remove overriding in tests (zucchini-nlp, Aug 6, 2024)
be939d8  comprehension when merging video (zucchini-nlp, Aug 6, 2024)
232eb7c  fix-copies (zucchini-nlp, Aug 6, 2024)
385a617  revert changes for embeds test (zucchini-nlp, Aug 6, 2024)
4831a7e  fix tests after making comprehension (zucchini-nlp, Aug 6, 2024)
85fbff9  Update src/transformers/models/blip_2/processing_blip_2.py (zucchini-nlp, Aug 8, 2024)
119178f  Update src/transformers/models/blip_2/processing_blip_2.py (zucchini-nlp, Aug 8, 2024)
2451911  more updates (zucchini-nlp, Aug 8, 2024)
414031e  fix tests (zucchini-nlp, Aug 8, 2024)
8cfad20  Merge remote-tracking branch 'upstream/main' into vlm_processors (zucchini-nlp, Aug 9, 2024)
41 changes: 33 additions & 8 deletions src/transformers/models/blip_2/modeling_blip_2.py
@@ -1767,12 +1767,25 @@ def forward(
language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
)
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)

if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
expected_device = language_model_attention_mask.device
attention_mask = torch.cat([language_model_attention_mask, attention_mask.to(expected_device)], dim=1)

# if the model already has "image_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if hasattr(self.config, "image_token_index"):
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)
else:
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
Review comment (Contributor):
Maybe this, or another version number, as we're in the 4.44 dev version.

Suggested change:
- "Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
+ "Using processors without these attributes in the config is deprecated and will throw an error in a later version"

Reply (Member, author):
Yes, will update accordingly when we get one step away from merging. I think two or three major versions from the current one will work :)

)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
)

if self.config.use_decoder_only_language_model:
outputs = self.language_model(
@@ -1876,13 +1889,25 @@ def generate(
.repeat(batch_size, 1)
.to(image_embeds.device)
)
inputs_embeds = self.get_input_embeddings()(input_ids)
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
attention_mask = torch.cat([language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1)

# concatenate query embeddings with prompt embeddings
inputs_embeds = self.get_input_embeddings()(input_ids)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
# if the model already has "image_token_index" then the input is expanded to account for image embeds
# otherwise we expand manually by concatenating
if hasattr(self.config, "image_token_index"):
special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds[special_image_mask] = language_model_inputs.flatten()
else:
logger.warning_once(
Review comment (Collaborator):
We should make sure the official checkpoints have been updated this way

"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
[language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
)

# add image_embeds length to max_length, so that the final max_length is counted only on token embeds
# -1 is to account for the prepended BOS after `generate`.
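
The heart of the modeling change is the branch on `image_token_index`: when the processor has already expanded the prompt with `<image>` placeholders, the Q-Former outputs are written into exactly those positions instead of being concatenated in front of the text, so the sequence length and attention mask stay consistent with what the processor produced. A minimal, self-contained sketch of the two paths with toy tensors (the token id, shapes, and filler values below are made up for illustration and are not taken from the PR):

import torch

IMAGE_TOKEN_ID = 32000                      # hypothetical placeholder id
hidden_size, num_query_tokens = 4, 3

# "tokenized" prompt: three image placeholders followed by two text tokens
input_ids = torch.tensor([[IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 5, 6]])
inputs_embeds = torch.zeros(1, 5, hidden_size)                          # stand-in for the embedding lookup
language_model_inputs = torch.ones(1, num_query_tokens, hidden_size)    # stand-in for the Q-Former output

# new path: scatter the image embeddings into the placeholder slots (out of place)
special_image_mask = (input_ids == IMAGE_TOKEN_ID).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

# legacy path: prepend the image embeddings to the whole text sequence
legacy = torch.cat([language_model_inputs, inputs_embeds], dim=1)

print(merged.shape)   # torch.Size([1, 5, 4]) - same length as the expanded prompt
print(legacy.shape)   # torch.Size([1, 8, 4]) - image embeds added on top of the text length
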
58 changes: 50 additions & 8 deletions src/transformers/models/blip_2/processing_blip_2.py
@@ -20,8 +20,18 @@

from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...tokenization_utils_base import (
AddedToken,
BatchEncoding,
PaddingStrategy,
PreTokenizedInput,
TextInput,
TruncationStrategy,
)
from ...utils import TensorType, logging


logger = logging.get_logger(__name__)


class Blip2Processor(ProcessorMixin):
@@ -36,20 +46,25 @@ class Blip2Processor(ProcessorMixin):
An instance of [`BlipImageProcessor`]. The image processor is a required input.
tokenizer (`AutoTokenizer`):
An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
num_query_tokens (`int`, *optional*):
Number of tokens used by the Qformer as queries; should be the same as in the model's config.
"""

attributes = ["image_processor", "tokenizer"]
valid_kwargs = []
image_processor_class = "BlipImageProcessor"
tokenizer_class = "AutoTokenizer"

# Copied from transformers.models.blip.processing_blip.BlipProcessor.__init__
def __init__(self, image_processor, tokenizer, **kwargs):
def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs):
tokenizer.return_token_type_ids = False
self.current_processor = image_processor
self.image_token = AddedToken("<image>", normalized=False, special=True)
tokens_to_add = {"additional_special_tokens": [self.image_token]}
tokenizer.add_special_tokens(tokens_to_add)
self.num_query_tokens = num_query_tokens

super().__init__(image_processor, tokenizer)
self.current_processor = self.image_processor

# Copied from transformers.models.blip.processing_blip.BlipProcessor.__call__
def __call__(
self,
images: ImageInput = None,
@@ -106,7 +121,13 @@ def __call__(
encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)

if text is not None:
text_encoding = self.tokenizer(
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")

text_encoding = {}
_text_encoding = self.tokenizer(
text=text,
add_special_tokens=add_special_tokens,
padding=padding,
@@ -121,9 +142,30 @@
return_token_type_ids=return_token_type_ids,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
return_tensors=None, # hardcode "None" here for prepending image tokens
**kwargs,
)

# if we know how many query tokens, expand text inside processor. We need this hacky manipulation
# because BLIP expects image tokens to be at the beginning even before BOS token
if self.num_query_tokens is not None:
image_tokens = self.image_token.content * self.num_query_tokens
image_token_encoding = self.tokenizer([image_tokens], add_special_tokens=False, return_tensors=None)
for k in _text_encoding:
text_encoding[k] = [
img_encoding + txt_encoding
for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k])
]
else:
text_encoding = _text_encoding
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.44."
)

# cast to desired return tensors type
text_encoding = BatchEncoding(text_encoding, tensor_type=return_tensors)
else:
text_encoding = None

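
On the processor side, the text is now tokenized with `return_tensors=None`, the `<image>` placeholder is repeated `num_query_tokens` times and tokenized once, and the resulting ids and attention mask are prepended to each prompt before everything is wrapped in a `BatchEncoding`. A simplified sketch of that prepending step on plain Python lists (the helper name and token ids are invented for illustration; the actual code zips the two tokenizer outputs rather than reusing a single row):

def prepend_image_tokens(text_encoding: dict, image_token_encoding: dict) -> dict:
    # both dicts map keys such as "input_ids" / "attention_mask" to lists of per-sample lists
    merged = {}
    for key, per_sample in text_encoding.items():
        image_row = image_token_encoding[key][0]   # single row built from "<image>" * num_query_tokens
        merged[key] = [image_row + sample for sample in per_sample]
    return merged

# toy example: two query tokens with a made-up id 50000, two prompts
text_enc = {"input_ids": [[1, 11, 12], [1, 21]], "attention_mask": [[1, 1, 1], [1, 1]]}
image_enc = {"input_ids": [[50000, 50000]], "attention_mask": [[1, 1]]}
print(prepend_image_tokens(text_enc, image_enc))
# {'input_ids': [[50000, 50000, 1, 11, 12], [50000, 50000, 1, 21]],
#  'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1]]}
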
4 changes: 4 additions & 0 deletions src/transformers/models/llava/configuration_llava.py
@@ -48,6 +48,8 @@ class LlavaConfig(PretrainedConfig):
Can be one of `"default"` or `"full"`.
vision_feature_layer (`int`, *optional*, defaults to -2):
The index of the layer to select the vision feature.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.

Example:

@@ -82,11 +84,13 @@ def __init__(
projector_hidden_act="gelu",
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_seq_length=576,
**kwargs,
):
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.image_seq_length = image_seq_length

if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
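
The new `image_seq_length` entry records how many placeholder positions a single image occupies in the prompt (the default here is 576), so processing code can repeat the `<image>` token before tokenization instead of the model concatenating embeddings at run time. A hypothetical helper, not the LLaVA processor itself, sketching how such a value could be used:

def expand_image_placeholders(prompt: str, image_seq_length: int = 576, image_token: str = "<image>") -> str:
    # repeat every "<image>" placeholder so it covers all positions the vision features will fill
    return prompt.replace(image_token, image_token * image_seq_length)

# shortened to 4 repeats so the output stays readable
print(expand_image_placeholders("USER: <image> What is shown here? ASSISTANT:", image_seq_length=4))
# USER: <image><image><image><image> What is shown here? ASSISTANT:
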