Cache: new Cache format in decoder-only models #31421

Merged

Commits (44 commits, all by zucchini-nlp)
183cd66  draft bart with new cache (Jun 14, 2024)
4578bca  add cache for decoder-only models (Jun 14, 2024)
9505ca4  revert utils (Jun 14, 2024)
2ab28f3  modify docstring (Jun 14, 2024)
5fe4e9e  revert bart (Jun 14, 2024)
09413c3  minor fixes (Jun 14, 2024)
3c27604  fix copies (not related) (Jun 14, 2024)
350acc5  revert tests (Jun 14, 2024)
c0adf10  remove enc-dec related code (Jun 17, 2024)
c18b177  remove bloom (Jun 17, 2024)
582f289  remove opt (enc-dec) (Jun 17, 2024)
3141a71  Merge remote-tracking branch 'upstream/main' into dynamic_cache_decod… (Jun 17, 2024)
33d54b4  update docstring (Jun 18, 2024)
dd05e6b  git, codegen, gpt_neo, gpt_neox, gpj (Jun 18, 2024)
cb878d5  clean up (Jun 19, 2024)
0588791  copied from statements (Jun 19, 2024)
a27b47c  revert (Jun 19, 2024)
1abcf30  tmp (Jun 19, 2024)
00ed88c  update warning msg (Jun 20, 2024)
6c3b3aa  forgot git (Jun 20, 2024)
fd5eeab  add more flags (Jun 21, 2024)
e233f29  run-slow git,codegen,gpt_neo,gpt_neox,gpj (Jun 21, 2024)
356d578  add cache flag to VLMs (Jul 9, 2024)
c906670  remove files (Jul 9, 2024)
08d9e6f  Merge branch 'main' into dynamic_cache_decoder_only (Jul 9, 2024)
56c05b2  style (Jul 9, 2024)
8510810  video LLMs also need a flag (Jul 9, 2024)
cebb55d  style (Jul 9, 2024)
8fd9dd1  llava will go in another PR (Jul 26, 2024)
4b9ced1  Merge branch 'main' into dynamic_cache_decoder_only (Jul 26, 2024)
aea219b  style (Jul 26, 2024)
4991863  [run-slow] codegen, falcon, git, gpt_neo, gpt_neox, gptj, idefics (Jul 26, 2024)
ec306a2  Update src/transformers/models/gpt_neo/modeling_gpt_neo.py (Jul 30, 2024)
cf793b7  copy from (Jul 30, 2024)
c92409c  deprecate until v4.45 and warn if not training (Jul 30, 2024)
c2b97e4  nit (Jul 30, 2024)
35b60de  fix test (Jul 30, 2024)
d2fca9a  test static cache (Aug 2, 2024)
0933350  Merge branch 'main' into dynamic_cache_decoder_only (Aug 2, 2024)
42349d4  add more tests and fix models (Aug 2, 2024)
45c3a1b  fix copies (Aug 2, 2024)
5f22616  return sliding window mask (Aug 2, 2024)
f5af6a2  run slow tests & fix + codestyle (Aug 6, 2024)
21b45c5  one more falcon fix for alibi (Aug 6, 2024)
4 changes: 3 additions & 1 deletion src/transformers/cache_utils.py
@@ -930,7 +930,9 @@ def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len:

        self.dtype = dtype if dtype is not None else torch.float32
        self.num_key_value_heads = (
-            config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
+            config.num_attention_heads
+            if getattr(config, "num_key_value_heads", None) is None
+            else config.num_key_value_heads
        )

        self.key_cache: List[torch.Tensor] = []
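Context for the change above: `StaticCache.__init__` previously read `config.num_key_value_heads` directly, which raises `AttributeError` for configs that never define that attribute at all (common for older decoder-only models that use plain multi-head attention rather than grouped-query attention). The snippet below is a minimal, self-contained sketch of the new fallback; the `SimpleNamespace` object is only a stand-in for a real `PretrainedConfig` and is not part of the diff.

from types import SimpleNamespace

# Stand-in for a config without grouped-query attention: no `num_key_value_heads` attribute.
config = SimpleNamespace(num_attention_heads=16)

# Mirrors the updated logic in StaticCache.__init__: fall back to `num_attention_heads`
# instead of raising AttributeError when `num_key_value_heads` is absent or None.
num_key_value_heads = (
    config.num_attention_heads
    if getattr(config, "num_key_value_heads", None) is None
    else config.num_key_value_heads
)
assert num_key_value_heads == 16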
2 changes: 1 addition & 1 deletion src/transformers/generation/utils.py
@@ -1470,7 +1470,7 @@ def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_l
        # NOTE: self.dtype is not compatible with torch.compile, as it calls `self.parameters()`.
        # Workaround: trust the lm_head, whose attribute name is somewhat consistent across generative
        # models. May cause trobles with non-text modalities.
-        cache_dtype = self.lm_head.weight.dtype
+        cache_dtype = self.get_output_embeddings().weight.dtype

        cache_kwargs = {
            "config": self.config,
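The switch to `get_output_embeddings()` matters because not every generative model exposes its output projection under the name `lm_head` (GIT, for instance, names its head differently), while `get_output_embeddings()` is part of the `PreTrainedModel` API. A small usage sketch follows; the tiny checkpoint name is illustrative only.

import torch
from transformers import AutoModelForCausalLM

# Any causal LM can report the dtype of its output projection this way,
# regardless of what the attribute holding that projection is called.
model = AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-gpt2", torch_dtype=torch.float16
)
cache_dtype = model.get_output_embeddings().weight.dtype
print(cache_dtype)  # torch.float16, without assuming an `lm_head` attribute exists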
411 changes: 286 additions & 125 deletions src/transformers/models/codegen/modeling_codegen.py

Large diffs are not rendered by default.

453 changes: 312 additions & 141 deletions src/transformers/models/falcon/modeling_falcon.py

Large diffs are not rendered by default.

145 changes: 84 additions & 61 deletions src/transformers/models/git/modeling_git.py

Large diffs are not rendered by default.

394 changes: 286 additions & 108 deletions src/transformers/models/gpt_neo/modeling_gpt_neo.py

Large diffs are not rendered by default.

442 changes: 315 additions & 127 deletions src/transformers/models/gpt_neox/modeling_gpt_neox.py

Large diffs are not rendered by default.

447 changes: 303 additions & 144 deletions src/transformers/models/gptj/modeling_gptj.py

Large diffs are not rendered by default.

261 changes: 214 additions & 47 deletions src/transformers/models/idefics/modeling_idefics.py

Large diffs are not rendered by default.
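The modeling-file diffs above are collapsed, but the commit messages ("add cache for decoder-only models", "deprecate until v4.45 and warn if not training") indicate the per-model pattern: accept either the legacy tuple of key/value tensors or a `Cache` object, convert legacy input into a `DynamicCache` internally, and warn that the tuple format is deprecated. The sketch below illustrates that conversion step only; the helper name and warning wording are assumptions, not code from any of the edited files.

import logging

from transformers.cache_utils import Cache, DynamicCache

logger = logging.getLogger(__name__)


def maybe_convert_legacy_cache(past_key_values, use_cache, training):
    """Illustrative helper: convert a legacy tuple cache into a DynamicCache."""
    return_legacy_cache = False
    if use_cache and not training and not isinstance(past_key_values, Cache):
        return_legacy_cache = True  # remember to convert back so callers still receive tuples for now
        past_key_values = DynamicCache.from_legacy_cache(past_key_values)
        logger.warning(
            "Passing a tuple of `past_key_values` is deprecated and will be removed in v4.45; "
            "pass a `Cache` instance (e.g. `DynamicCache`) instead."
        )
    return past_key_values, return_legacy_cache

A model that received the legacy format can convert back at the end of `forward` with `past_key_values.to_legacy_cache()`, so the return type still matches what the caller passed in.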

49 changes: 48 additions & 1 deletion tests/generation/test_utils.py
@@ -59,7 +59,7 @@
    ImageGPTForCausalImageModeling,
    SpeechEncoderDecoderModel,
)
-from transformers.cache_utils import DynamicCache, EncoderDecoderCache, QuantoQuantizedCache
+from transformers.cache_utils import DynamicCache, EncoderDecoderCache, QuantoQuantizedCache, StaticCache
from transformers.generation import (
    BeamSampleDecoderOnlyOutput,
    BeamSampleEncoderDecoderOutput,

@@ -1769,6 +1769,53 @@ def test_new_cache_format(self, num_beams, do_sample):
                )
            )

    def test_generate_with_static_cache(self):
        """
        Tests that `StaticCache` works when `cache_implementation="static"` is passed at generation
        time. This does not check generation quality; it only checks that models with
        `_supports_static_cache` do not raise an error when generating and that they return a
        `StaticCache` object at the end.
        """
        for model_class in self.all_generative_model_classes:
            if not model_class._supports_static_cache:
                self.skipTest(reason="This model does not support the static cache format")

            config, input_ids, attention_mask = self._get_input_ids_and_config()
            if config.is_encoder_decoder:
                self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache")

            config.use_cache = True
            config.is_decoder = True
            batch_size, seq_length = input_ids.shape
            max_new_tokens = 20

            model = model_class(config).to(torch_device).eval()
            generation_kwargs = {
                "max_length": None,
                "max_new_tokens": max_new_tokens,
                "cache_implementation": "static",
                "return_dict_in_generate": True,  # Required to return `past_key_values`
            }

            max_cache_len = seq_length + max_new_tokens
            head_dim = (
                model.config.head_dim
                if hasattr(model.config, "head_dim")
                else model.config.hidden_size // model.config.num_attention_heads
            )
            num_key_value_heads = (
                model.config.num_attention_heads
                if getattr(config, "num_key_value_heads", None) is None
                else model.config.num_key_value_heads
            )
            num_hidden_layers = config.num_hidden_layers
            results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)

            cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim)
            self.assertTrue(isinstance(results.past_key_values, StaticCache))
            self.assertTrue(len(results.past_key_values.key_cache) == num_hidden_layers)
            self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape)

    @require_quanto
    def test_generate_with_quant_cache(self):
        for model_class in self.all_generative_model_classes:
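For reference, the user-facing behaviour that `test_generate_with_static_cache` locks in looks roughly like the following; the checkpoint is only an example of a decoder-only architecture covered by this PR (GPT-NeoX), and any model with `_supports_static_cache = True` should behave the same way.

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import StaticCache

model_id = "EleutherAI/pythia-70m"  # illustrative GPT-NeoX checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).eval()

inputs = tokenizer("The new cache format", return_tensors="pt")
out = model.generate(
    **inputs,
    max_new_tokens=20,
    cache_implementation="static",
    return_dict_in_generate=True,  # required to get `past_key_values` back
)

assert isinstance(out.past_key_values, StaticCache)
# One pre-allocated key tensor per layer, shaped (batch, num_kv_heads, max_cache_len, head_dim).
print(len(out.past_key_values.key_cache), out.past_key_values.key_cache[0].shape)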
3 changes: 2 additions & 1 deletion tests/models/phi3/test_modeling_phi3.py
@@ -16,6 +16,7 @@
"""Testing suite for the PyTorch Phi-3 model."""

import unittest
+from typing import List

from parameterized import parameterized

@@ -69,7 +70,7 @@ def forward(
        ).logits

    @staticmethod
-    def generate(model: Phi3ForCausalLM, prompt_tokens: torch.LongTensor, max_seq_len: int) -> list[int]:
+    def generate(model: Phi3ForCausalLM, prompt_tokens: torch.LongTensor, max_seq_len: int) -> List[int]:
        model = Phi3MiniWithStaticCache(model, 1, max_seq_len + prompt_tokens.shape[-1])

        response_tokens = []
38 changes: 38 additions & 0 deletions tests/test_modeling_common.py
@@ -4587,6 +4587,44 @@ def test_custom_4d_attention_mask(self):
        normalized_1 = F.softmax(out_shared_prefix_last_tokens)
        torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)

    def test_static_cache_matches_dynamic(self):
        """
        Tests that generating with a static cache gives almost the same results as generating with a
        dynamic cache. This test does not compile the model; it only checks that the logits are
        similar, allowing for small numerical-precision differences.
        """
        if len(self.all_generative_model_classes) == 0:
            self.skipTest(reason="Model architecture has no generative classes, so there is nothing to compare")

        for model_class in self.all_generative_model_classes:
            if not model_class._supports_static_cache:
                self.skipTest(f"{model_class.__name__} does not support static cache")

            if not model_class._supports_cache_class:
                self.skipTest(f"{model_class.__name__} does not support cache class")

            config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
            if getattr(config, "sliding_window", 0) > 0:
                self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test")

            model = model_class(config).to(device=torch_device, dtype=torch.float32)
            model.eval()

            dynamic_out = model.generate(
                **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True
            )
            static_out = model.generate(
                **inputs,
                do_sample=False,
                max_new_tokens=10,
                cache_implementation="static",
                output_logits=True,
                return_dict_in_generate=True,
            )
            self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4))

    # For now, Let's focus only on GPU for `torch.compile`
    @slow
    @require_torch_gpu
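Outside the test harness, the dynamic-versus-static comparison in `test_static_cache_matches_dynamic` can be reproduced with two `generate` calls; the checkpoint name is illustrative (GPT-Neo is one of the architectures updated here), and the tolerances mirror the ones used in the test.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "EleutherAI/gpt-neo-125m"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).eval()

inputs = tokenizer("Static and dynamic caches should agree", return_tensors="pt")
common = dict(do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True)

dynamic_out = model.generate(**inputs, **common)                                # default DynamicCache
static_out = model.generate(**inputs, cache_implementation="static", **common)  # StaticCache

# Greedy decoding with either cache should produce near-identical per-step logits.
torch.testing.assert_close(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)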