From 9a7a68f298c2cfebbd4ee1657d0788819a89a3de Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Tue, 18 Jul 2023 14:47:06 +0000
Subject: [PATCH 1/3] fix dtype issue

---
 src/transformers/models/instructblip/modeling_instructblip.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 1254ef558f6b1b..15262832f0c281 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -659,13 +659,14 @@ def forward(
                 attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        attention_scores_dtype = attention_scores.dtype

         if attention_mask is not None:
             # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.Softmax(dim=-1)(attention_scores.float()).to(attention_scores_dtype)

         if is_cross_attention and self.save_attention:
             self.save_attention_map(attention_probs)
@@ -1038,6 +1039,7 @@ def forward(
         else:
             embeddings = query_embeds

+        embeddings = embeddings.to(self.layernorm.weight.dtype)
         embeddings = self.layernorm(embeddings)
         embeddings = self.dropout(embeddings)
         return embeddings

From a26f4b42938e12d253d65a4d91a5c6815234f7ec Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Tue, 18 Jul 2023 14:52:14 +0000
Subject: [PATCH 2/3] revert `.float()`

---
 src/transformers/models/instructblip/modeling_instructblip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 15262832f0c281..53685a06b1e16d 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -666,7 +666,7 @@ def forward(
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores.float()).to(attention_scores_dtype)
+        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)

         if is_cross_attention and self.save_attention:
             self.save_attention_map(attention_probs)

From 439bf22c85bc451c7206011ab62e249940f74b24 Mon Sep 17 00:00:00 2001
From: younesbelkada
Date: Tue, 18 Jul 2023 14:56:00 +0000
Subject: [PATCH 3/3] fix copies

---
 src/transformers/models/instructblip/modeling_instructblip.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py
index 53685a06b1e16d..645c38c2046273 100644
--- a/src/transformers/models/instructblip/modeling_instructblip.py
+++ b/src/transformers/models/instructblip/modeling_instructblip.py
@@ -558,7 +558,6 @@ def get_input_embeddings(self):
         return self.embeddings


-# Copied from transformers.models.blip_2.modeling_blip_2.Blip2QFormerMultiHeadAttention with Blip2->InstructBlip
 class InstructBlipQFormerMultiHeadAttention(nn.Module):
     def __init__(self, config, is_cross_attention=False):
         super().__init__()
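
Note on the fix (not part of the patch): both casts guard against a dtype mismatch when the Q-Former runs in reduced precision. `attention_scores_dtype` is captured before the (possibly float32) attention mask is added so the softmax output can be cast back, and the Q-Former embeddings are cast to the LayerNorm weight dtype before normalization. The sketch below is only an illustration of the embedding-side issue: a standalone `nn.LayerNorm` stands in for `InstructBlipQFormerEmbeddings.layernorm`, and bfloat16 is used merely so it runs on CPU, while the original report concerned float16 weights.

# Minimal illustrative sketch, not the actual InstructBlip module.
import torch
import torch.nn as nn

# LayerNorm held in reduced precision, as when the checkpoint is loaded with a
# low-precision torch_dtype; bfloat16 keeps this runnable on CPU.
layernorm = nn.LayerNorm(32).to(torch.bfloat16)

# Query embeddings that stayed in float32 would reach the LayerNorm with a
# mismatched dtype; the patch casts them to the LayerNorm weight dtype first.
query_embeds = torch.randn(1, 4, 32, dtype=torch.float32)
embeddings = query_embeds.to(layernorm.weight.dtype)

print(layernorm(embeddings).dtype)  # torch.bfloat16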